From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 3 Jan 2023 15:37:42 +0000 Subject: [PATCH 001/123] First changes to custom_op for RTL-based MVAU --- .../matrixvectoractivation_rtl.py | 1036 +++++++++++++++++ 1 file changed, 1036 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..c8a0aa675b --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -0,0 +1,1036 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +from . import templates + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... 
here can be any shape (representing groups of vectors) + + +class MatrixVectorActivation_rtl(HLSCustomOp): + """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch + function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + self.decoupled_wrapper = templates.decoupled_wrapper + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "SIMD": ("i", True, 0), + "MW": ("i", True, 0), + "MH": ("i", True, 0), + "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "ActVal": ("i", False, 0), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # FINN DataType for accumulator -- auto-computed and updated + "accDataType": ("s", False, "INT32"), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), + # no-activation mode (produce accumulators) + "noActivation": ("i", False, 0, {0, 1}), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # memory mode for the FC weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # FPGA resource type for 
def calc_wmem(self):
    """Calculate and return WMEM, i.e. ``MW * MH // (PE * SIMD)``.

    Reads the ``MW``, ``MH``, ``PE`` and ``SIMD`` node attributes and asserts
    that MH is a multiple of PE and MW is a multiple of SIMD before the
    division is carried out.
    """
    matrix_w, matrix_h, n_pe, n_simd = (
        self.get_nodeattr(key) for key in ("MW", "MH", "PE", "SIMD")
    )
    assert matrix_h % n_pe == 0, "Requirement MH divisable by PE is violated."
    assert matrix_w % n_simd == 0, "Requirement MW divisable by SIMD is violated."
    # total number of weights divided by the weights consumed per cycle
    weights_per_cycle = n_pe * n_simd
    return (matrix_w * matrix_h) // weights_per_cycle
def uram_estimation(self):
    """Estimate the number of URAMs used for weight storage.

    Returns 0 when the weights are streamed externally, when decoupled mode
    uses a ``ram_style`` other than ``"ultra"``, or when const mode has a WMEM
    of at most 128. Otherwise the memory is tiled into 72-bit wide,
    4096-entry deep units and the tile count is returned.
    """
    pe = self.get_nodeattr("PE")
    simd = self.get_nodeattr("SIMD")
    weight_bits = self.get_weight_datatype().bitwidth()
    mw = self.get_nodeattr("MW")
    mh = self.get_nodeattr("MH")
    # memory depth (entries) and width (bits per entry)
    mem_depth = (mw * mh) / (simd * pe)
    mem_width = simd * weight_bits * pe
    mem_mode = self.get_nodeattr("mem_mode")
    ram_style = self.get_nodeattr("ram_style")

    if mem_mode == "external":
        return 0
    if mem_mode == "decoupled" and ram_style != "ultra":
        return 0
    if mem_mode == "const" and self.calc_wmem() <= 128:
        return 0

    tiles_wide = math.ceil(mem_width / 72)
    tiles_deep = math.ceil(mem_depth / 4096)
    return tiles_wide * tiles_deep
def lut_estimation(self):
    """Calculate a resource estimation for LUTs.

    Based on:
    - FINN-R: An End-to-End Deep-Learning Framework for Fast
    Exploration of Quantized Neural Networks
    - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
    Y. Umuroglu, M. Leeser and K. Vissers
    - 12. Sep 2018

    Returns the estimate as an ``int``. When ``noActivation`` is 0 the
    threshold contribution is derived from ``self.calc_tmem()``.
    """
    # TODO: FIX (marker carried over from the original source)
    # TODO add in/out FIFO contributions
    P = self.get_nodeattr("PE")
    Q = self.get_nodeattr("SIMD")
    MW = self.get_nodeattr("MW")
    W = self.get_weight_datatype().bitwidth()
    A = self.get_input_datatype().bitwidth()
    # parameters from experiments in paper mentioned above
    c0 = 300
    c1 = 1.1
    c2 = 0
    mmode = self.get_nodeattr("mem_mode")
    mstyle = self.get_nodeattr("ram_style")
    if (mmode == "decoupled" and mstyle == "distributed") or (
        mmode == "const" and self.calc_wmem() <= 128
    ):
        # weights held in LUTRAM contribute to the LUT count
        c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)

    # multiplication (no LUTs when mapped to DSPs)
    res_type = self.get_nodeattr("resType")
    if res_type == "dsp":
        mult_luts = 0
    else:
        mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
    # adder tree
    addertree_luts = (W + A) * (2 * Q - 1)
    # accumulator: math.log2 is used instead of math.log(MW, 2) because the
    # two-argument form can land slightly above an exact integer, which would
    # make ceil() over-count the accumulator width by one bit
    acc_bits = W + A + math.ceil(math.log2(MW))
    acc_luts = acc_bits
    # thresholds and threshold comparators
    thr_luts = 0
    comp_luts = 0
    noact = self.get_nodeattr("noActivation")
    if noact == 0:
        odt = self.get_output_datatype()
        B = odt.bitwidth()
        thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
        comp_luts = (2**B - 1) * acc_bits

    return int(
        c0
        + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
        + c2
    )
def get_exp_cycles(self):
    """Return the expected cycle count as an ``int``.

    The value is ``(MH / PE) * (MW / SIMD) * prod(numInputVectors) / mmv``,
    truncated by ``int()``.
    """
    # TODO: FIX (marker carried over from the original source)
    n_pe = self.get_nodeattr("PE")
    n_simd = self.get_nodeattr("SIMD")
    input_vectors = self.get_nodeattr("numInputVectors")
    matrix_h = self.get_nodeattr("MH")
    matrix_w = self.get_nodeattr("MW")
    # since mmv != 1 is not supported yet, we set mmv for now to 1
    mmv = 1
    neuron_fold = matrix_h / n_pe
    synapse_fold = matrix_w / n_simd
    total = neuron_fold * synapse_fold * np.prod(input_vectors) / mmv
    return int(total)
def get_folded_input_shape(self, ind=0):
    """Return the folded shape of input ``ind`` as a tuple.

    ``ind == 0`` gives ``numInputVectors + (MW // SIMD, SIMD)``.
    ``ind == 1`` is only defined when ``mem_mode`` is ``"external"`` and gives
    ``numInputVectors + ((MW // SIMD) * (MH // PE), SIMD * PE)``.
    Any other request raises ``Exception``.
    """
    matrix_w = self.get_nodeattr("MW")
    matrix_h = self.get_nodeattr("MH")
    n_simd = self.get_nodeattr("SIMD")
    n_pe = self.get_nodeattr("PE")
    synapse_fold = matrix_w // n_simd
    neuron_fold = matrix_h // n_pe
    outer_dims = list(self.get_nodeattr("numInputVectors"))

    if ind == 0:
        # shape of input 0 (activations)
        return tuple(outer_dims + [synapse_fold, n_simd])
    if ind == 1 and self.get_nodeattr("mem_mode") == "external":
        # shape of input 1 (weights)
        return tuple(outer_dims + [synapse_fold * neuron_fold, n_simd * n_pe])
    raise Exception("Undefined input shape for requested input")
def get_number_output_values(self):
    """Return the product of all folded-output dimensions except the last."""
    folded_shape = self.get_folded_output_shape()
    outer_dims = folded_shape[: len(folded_shape) - 1]
    return np.prod(outer_dims)
def make_weight_file(self, weights, weight_file_mode, weight_file_name):
    """Produce a file containing given weights in appropriate format for this
    layer. This file can be used for either synthesis or run-time reconfig
    of weights.

    Arguments:
    * weights : numpy array with weights to be put into the file
    * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
      decoupled_runtime}
    * weight_file_name : filename for the weight file to be generated

    Raises ``Exception`` for any other ``weight_file_mode``.
    """
    # convert weights into hlslib-compatible format
    weight_tensor = self.get_hls_compatible_weight_tensor(weights)
    export_wdt = self.get_weight_datatype()
    if "decoupled" not in weight_file_mode:
        raise Exception("Unknown/unsupported weight_file_mode")

    # create a weight stream for various flavors of decoupled mode:
    # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
    weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
    pe = self.get_nodeattr("PE")
    simd = self.get_nodeattr("SIMD")

    if weight_file_mode == "decoupled_npy":
        # reverse SIMD flip for saving weights in .npy; this is the file that
        # generate_params requests and execute_node later loads as weights.npy
        weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
        weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
            1, -1, pe * simd
        ).copy()
        np.save(weight_file_name, weight_tensor_simd_flipped)
        return

    # PE flip for saving weights in .dat / runtime format
    weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
    weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
        1, -1, pe * simd
    ).copy()

    if weight_file_mode == "decoupled_verilog_dat":
        # convert weight values into hexstring
        weight_width = self.get_weightstream_width()
        # pad to nearest 4 bits to get hex strings
        weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
        weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
            weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
        )
        weight_stream = weight_tensor_pe_flipped.flatten().copy()
        with open(weight_file_name, "w") as f:
            # one hex string per memory line
            f.writelines(val + "\n" for val in weight_stream)
    elif weight_file_mode == "decoupled_runtime":
        # memstream axi-lite interface will map each mem line to
        # one or multiple 32-bit words
        weight_width = self.get_weightstream_width()
        words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
        if words_per_memwidth < 1:
            words_per_memwidth = 1
        weight_width_padded = words_per_memwidth * 32
        # first, pack and ensure padding to 32 bits
        weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
            weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
        )
        weight_stream = weight_tensor_pe_flipped.flatten().copy()
        with open(weight_file_name, "w") as f:
            for val in weight_stream:
                # split into groups of 8 hex digits (= 32 bits) and emit the
                # words in reversed order
                words_32b = textwrap.wrap(val, 8)
                words_32b.reverse()
                for word_32b in words_32b:
                    f.write(word_32b + "\n")
    else:
        raise Exception("Unknown/unsupported weight_file_mode")
def execute_node(self, context, graph):
    """Execute this node via rtlsim and write the result into ``context``.

    Only ``exec_mode == "rtlsim"`` is supported: ``"cppsim"`` and any other
    value raise ``Exception``. The first node input is reshaped to the folded
    input shape and saved as ``input_0.npy`` in ``code_gen_dir_ipgen``; for
    ``mem_mode`` ``"external"``/``"decoupled"`` the weight stream is read from
    ``weights.npy`` in the same directory. The simulation output is stored
    under ``context[node.output[0]]`` with the normal output shape.

    ``graph`` is accepted for interface compatibility and is not used here.
    """
    mode = self.get_nodeattr("exec_mode")
    mem_mode = self.get_nodeattr("mem_mode")
    node = self.onnx_node

    # TODO ensure codegen dir exists
    if mode == "cppsim":
        raise Exception(
            "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
        )
    if mode != "rtlsim":
        raise Exception(
            """Invalid value for attribute exec_mode! Is currently set to: {}
        has to be set to one of the following value ("cppsim", "rtlsim")""".format(
                mode
            )
        )
    # from here on mode is guaranteed to be "rtlsim"
    code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
    # bound up front so it exists regardless of how many inputs are visited
    export_idt = self.get_input_datatype()

    # create a npy file for each input of the node (in_ind is input index)
    for in_ind, inputs in enumerate(node.input):
        # it is assumed that the first input of the node is the data input
        # the second input are the weights
        # the third input are the thresholds
        if in_ind == 0:
            assert (
                str(context[inputs].dtype) == "float32"
            ), """Input datatype is
            not float32 as expected."""
            expected_inp_shape = self.get_folded_input_shape()
            # make copy before saving the array
            reshaped_input = context[inputs].reshape(expected_inp_shape).copy()
            np.save(
                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
                reshaped_input,
            )
        elif in_ind > 2:
            raise Exception("Unexpected input found for MatrixVectorActivation")

    sim = self.get_rtlsim()
    nbits = self.get_instream_width()
    inp = npy_to_rtlsim_input(
        "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
    )
    super().reset_rtlsim(sim)
    super().toggle_clk(sim)
    if mem_mode == "external" or mem_mode == "decoupled":
        wnbits = self.get_weightstream_width()
        export_wdt = self.get_weight_datatype()
        wei = npy_to_rtlsim_input(
            "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
        )
        # the weight stream is repeated once per input vector
        num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
        io_dict = {
            "inputs": {"in0": inp, "weights": wei * num_w_reps},
            "outputs": {"out": []},
        }
        self.rtlsim_multi_io(sim, io_dict)
        output = io_dict["outputs"]["out"]
    else:
        output = self.rtlsim(sim, inp)

    odt = self.get_output_datatype()
    target_bits = odt.bitwidth()
    packed_bits = self.get_outstream_width()
    out_npy_path = "{}/output.npy".format(code_gen_dir)
    out_shape = self.get_folded_output_shape()
    rtlsim_output_to_npy(
        output, out_npy_path, odt, out_shape, packed_bits, target_bits
    )
    # load and reshape output
    output = np.load(out_npy_path)
    oshape = self.get_normal_output_shape()
    output = np.asarray([output], dtype=np.float32).reshape(*oshape)
    context[node.output[0]] = output
cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, 
def get_verilog_top_module_intf_names(self):
    """Return the interface-name dict of the base class, extended for weights.

    In ``"external"`` mem_mode a ``weights_<sname>`` entry with the padded
    weight stream width is appended to ``s_axis``. In ``"decoupled"`` mode an
    ``axilite`` entry is added only when ``runtime_writeable_weights`` is 1.
    """
    names = super().get_verilog_top_module_intf_names()
    weight_mode = self.get_nodeattr("mem_mode")
    suffix = self.hls_sname()
    if weight_mode == "external":
        weight_port = ("weights_" + suffix, self.get_weightstream_width_padded())
        names["s_axis"].append(weight_port)
    elif weight_mode == "decoupled":
        # only expose axilite interface if attribute is set
        if self.get_nodeattr("runtime_writeable_weights") == 1:
            names["axilite"] = ["s_axilite"]
    return names
num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + def generate_hdl(self): +#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded + template_path, code_gen_dict = self.prepare_codegen_default() + + # add general parameters to dictionary + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) +#TODO: currently only ram_style=auto is supported + ram_style = self.get_nodeattr("ram_style") + if ram_style == "auto": + continue + else: + raise Exception("Unrecognized ram_style for MatrixVectorActivation") + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ), + "w", + ) as f: + f.write(template) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self): + # TODO: Differentiate between PE folding and fully unrolled along MH dimension + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl" + ) + code_gen_dict = {} + + code_gen_dict["$PE$"] = self.get_nodeattr("PE") + code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD") + code_gen_dict["$MW$"] = self.get_nodeattr("MW") + code_gen_dict["$MH$"] = self.get_nodeattr("MH") + code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth() + code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth() + code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth() + + return template_path, code_gen_dict + From afab9cd6543b4fe1f612c329074d30d59706ac08 Mon 
Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:01 +0100 Subject: [PATCH 002/123] [rtl custom op]: initial implementation of mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv new file mode 100644 index 0000000000..c992990d9f --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -0,0 +1,284 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_8sx9 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + ) + ( + input logic clk, + input logic rst, + input logic en, + input logic last, + input logic zero, + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, + output logic vld, + output logic [PE-1:0][57:0] p + ); + +//-------------------- Declare global signals --------------------\\ +localparam int unsigned CHAINLEN = (SIMD+2)/3; +localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length +uwire [26:0] a_in_i [CHAINLEN]; +uwire [23:0] b_in_i [PE][CHAINLEN]; +uwire [57:0] pcout [PE][CHAINLEN]; + +//-------------------- Shift register for opmode select signal --------------------\\ +localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) +logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + +always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; +end +assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ +logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + +if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end +end; + +//-------------------- Buffer for input activations --------------------\\ +localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; +typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + +for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + +end : genActSIMD + +//-------------------- Buffer for weights --------------------\\ +localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; +typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + +for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + +end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ +for (genvar j=0; j0 ? 
2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain +end : genDSPPE + +endmodule From 
a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:22 +0100 Subject: [PATCH 003/123] [rtl custom op]: testbench for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..ea3ecbbd70 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_8sx9_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:35:30 +0100 Subject: [PATCH 004/123] [rtl custom op]: initial implementation of flow control component for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv new file mode 100644 index 0000000000..8765c50a26 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -0,0 +1,179 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+// Stream wrapper around the mvu_8sx9 compute core.
+//
+// Three valid/ready streams are exposed: weights and input (slaves) and
+// output (master). Input words pass through a replay_buffer configured with
+// LEN=SF and REP=NF; a core beat is strobed only when both the replayed
+// activation and the weight word are valid. The low ACCU_WIDTH bits of each of
+// the PE 58-bit core results are forwarded through two registers (A, B) to
+// the output stream.
+module mvu_8sx9_axi #(
+    int unsigned MW,
+    int unsigned MH,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH,
+    int unsigned ACCU_WIDTH,
+    bit SIGNED_ACTIVATIONS = 0,
+    int unsigned SEGMENTLEN = 0,
+    parameter RAM_STYLE = "auto",
+
+    // *_BA widths are rounded up to a whole number of bytes
+    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+    localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+    localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+    localparam int unsigned SF = MW/SIMD,
+    localparam int unsigned NF = MH/PE,
+    localparam int unsigned OUTPUT_LANES = PE,
+    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+    // Global Control
+    input logic ap_clk,
+    input logic ap_rst_n,
+
+    // Weight Stream
+    input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata,
+    input logic s_axis_weights_tvalid,
+    output logic s_axis_weights_tready,
+
+    // Input Stream
+    input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata,
+    input logic s_axis_input_tvalid,
+    output logic s_axis_input_tready,
+
+    // Output Stream
+    output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata,
+    output logic m_axis_output_tvalid,
+    input logic m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+    initial begin
+        if (MW % SIMD != 0) begin
+            $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+            $finish;
+        end
+        if (MH % PE != 0) begin
+            $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+            $finish;
+        end
+        if (ACTIVATION_WIDTH > 9) begin
+            $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+            $finish;
+        end
+        if (WEIGHT_WIDTH > 8) begin
+            $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+            $finish;
+        end
+        if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+            $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+            $finish;
+        end
+        if (SEGMENTLEN == 0) begin
+            $warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+        end
+        if (SEGMENTLEN > (SIMD+2)/3) begin
+            $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+            $finish;
+        end
+    end
+
+    uwire clk = ap_clk;
+    uwire rst = !ap_rst_n;
+
+    typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+    uwire mvauin_t amvau;
+    uwire alast;
+    uwire afin;
+    uwire avld;
+    uwire ardy;
+
+    // The cast drops the byte-alignment padding of the input stream word
+    replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+        .clk, .rst,
+        .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+        .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+    );
+
+    //-------------------- Input control --------------------\\
+    uwire en;
+    // istb: both operands of a beat are available
+    uwire istb = avld && s_axis_weights_tvalid;
+    // each stream is only popped when the other one is valid too
+    assign ardy = en && s_axis_weights_tvalid;
+    assign s_axis_weights_tready = en && avld;
+
+    //-------------------- Core MVU --------------------\\
+    uwire ovld;
+    uwire [PE-1:0][57:0] odat;
+    typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+    // `last` is qualified with istb: alast is derived from the replay counter
+    // alone and stays high while the buffer waits on the final beat of a row,
+    // so unqualified it would be shifted into the core on every enabled cycle
+    // of such a stall.
+    mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+        .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+        .clk, .rst, .en,
+        .last(alast && istb), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+        .vld(ovld), .p(odat)
+    );
+
+    //-------------------- Output register slice --------------------\\
+    struct {
+        logic vld;
+        logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+    } A = '{ vld: 0, default: 'x};
+
+    // the core is stalled only while A is occupied and another result is pending
+    assign en = !A.vld || !ovld;
+
+    uwire b_load;
+    always_ff @(posedge clk) begin
+        if(rst) A <= '{ vld: 0, default: 'x };
+        else if(!A.vld || b_load) begin
+            A.vld <= ovld && en;
+            for(int unsigned i = 0; i < PE; i++) begin
+                // CR-1148862:
+                // A.dat[i] <= odat[i];
+                automatic logic [57:0] v = odat[i];
+                A.dat[i] <= v[ACCU_WIDTH-1:0];
+            end
+        end
+    end
+
+    struct {
+        logic vld;
+        logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+    } B = '{ vld: 0, default: 'x};
+
+    assign b_load = !B.vld || m_axis_output_tready;
+    always_ff @(posedge clk) begin
+        // vld must be cleared explicitly: it drives m_axis_output_tvalid and
+        // b_load, and `default: 'x` alone would leave it unknown after reset
+        if(rst) B <= '{ vld: 0, default: 'x };
+        else begin
+            if(b_load) B <= '{ vld: A.vld, dat: A.dat};
+        end
+    end
+
+    assign m_axis_output_tvalid = B.vld;
+    assign m_axis_output_tdata = B.dat;
+
+endmodule
\ No newline at end of file From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:36:00 +0100 Subject: [PATCH 005/123] [rtl custom op]: implementation of replay buffer for mvu --- finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 finn-rtllib/mvu/replay_buffer.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..685ac03137 --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,109 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3.
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. 
Preußer
+ *****************************************************************************/
+
+// Passes a sequence of LEN input words through once while recording it, then
+// replays the recorded sequence until it has been output REP times in total.
+// Input is only accepted (irdy) during the first pass; olast marks the final
+// word of every pass and ofin the final word of the final pass.
+module replay_buffer #(
+    int unsigned LEN, // Sequence length
+    int unsigned REP, // Sequence replay count
+    int unsigned W, // Data width
+    parameter RAM_STYLE = "auto" // ram style for buffer {block, distributed, ultra, auto}
+)(
+    input logic clk,
+    input logic rst,
+
+    input logic [W-1:0] idat,
+    input logic ivld,
+    output logic irdy,
+
+    output logic [W-1:0] odat,
+    output logic olast,
+    output logic ofin,
+    output logic ovld,
+    input logic ordy
+);
+
+    // Count is split into two fields: the low $clog2(LEN) bits index the word
+    // within a pass, the bits above count completed passes.
+    typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t;
+    count_t Count = 0;
+    // true once every set bit of LEN-1 is also set in the low field
+    uwire done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+    uwire done_rep;
+    uwire done_all = done_len && done_rep;
+
+    uwire shift;
+    uwire clr = rst || (done_all && shift);
+    always_ff @(posedge clk) begin
+        if(clr) Count <= 0;
+        // at the end of a pass the increment 2**$clog2(LEN)-LEN+1 wraps the
+        // low field from LEN-1 to zero and carries one into the pass field
+        else if(shift) Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+    end
+
+    typedef logic [W-1:0] data_t;
+    uwire data_t rdat;
+    uwire first_rep;
+    if(REP == 1) begin
+        // no replay needed: permanently in pass-through mode, no buffer
+        assign done_rep = 1;
+        assign first_rep = 1;
+        assign rdat = 'x;
+    end
+    else begin
+        // same all-bits-set test as done_len, applied to the pass field
+        assign done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
+
+        // set by clr, cleared by the shift that completes the first pass
+        logic FirstRep = 1;
+        always_ff @(posedge clk) begin
+            if(clr) FirstRep <= 1;
+            else if(shift) FirstRep <= FirstRep && !done_len;
+        end
+        assign first_rep = FirstRep;
+
+        (* RAM_STYLE = RAM_STYLE *)
+        data_t Buf[LEN];
+        if(LEN == 1) begin : genTrivial
+            // single-word sequence: capture it once during the first pass
+            always_ff @(posedge clk) begin
+                if(shift && FirstRep) Buf[0] <= idat;
+            end
+        end : genTrivial
+        else begin : genShift
+            // every output word re-enters at index 0, so during replay the
+            // word read from Buf[LEN-1] recirculates through the buffer
+            always_ff @(posedge clk) begin
+                if(shift) Buf <= { odat, Buf[0:LEN-2] };
+            end
+        end : genShift
+
+        assign rdat = Buf[LEN-1];
+    end
+
+    // first pass: the input is forwarded directly; afterwards the buffer
+    // output is always valid and the input side is held off
+    assign irdy = ordy && first_rep;
+    assign odat = first_rep? idat : rdat;
+    assign olast = done_len;
+    assign ofin = done_all;
+    assign ovld = first_rep? ivld : 1;
+    assign shift = ovld && ordy;
+
+endmodule : replay_buffer
\ No newline at end of file From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:16 +0100 Subject: [PATCH 006/123] [rtl custom op]: testbench for mvu_8sx9_axi (including axi_wrapper & compute kernel) --- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv new file mode 100644 index 0000000000..ea97e0708c --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -0,0 +1,208 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_8sx9_axi_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire 
ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:55 +0100 Subject: [PATCH 007/123] [rtl custom op]: initial implementation of verilog wrapper for mvu_8sx9_axi --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v new file mode 100644 index 0000000000..ff3779d211 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = $IBUF_RAM_STYLE$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +mvu_8sx9_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + 
.m_axis_output_tready(m_axis_output_tready) +) + +endmodule : mvau_8sx9_axi_wrapper \ No newline at end of file From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 11 Apr 2023 15:50:24 +0100 Subject: [PATCH 008/123] [rtl mvu]: fix tab indentation --- finn-rtllib/mvu/mvu_8sx9.sv | 424 ++++++++++++------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 32 +- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 342 ++++++++++---------- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 26 +- finn-rtllib/mvu/mvu_8sx9_tb.sv | 258 +++++++-------- 5 files changed, 541 insertions(+), 541 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index c992990d9f..d082d4fb2e 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,233 +52,233 @@ module mvu_8sx9 #( ); //-------------------- Declare global signals --------------------\\ -localparam int unsigned CHAINLEN = (SIMD+2)/3; -localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length -uwire [26:0] a_in_i [CHAINLEN]; -uwire [23:0] b_in_i [PE][CHAINLEN]; -uwire [57:0] pcout [PE][CHAINLEN]; + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + uwire [26:0] a_in_i [CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [57:0] pcout [PE][CHAINLEN]; //-------------------- Shift register for opmode select signal --------------------\\ -localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) -logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) -always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; -end -assign vld = L[0]; + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + end + assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ -logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) -if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end - end -end; + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end + end; //-------------------- Buffer for input activations --------------------\\ -localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; -typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; -for (genvar i=0; i1 ? 
TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; - end : genExternalPregAct - else begin : genInpDSPAct - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} - : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; - end : genInpDSPAct + for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; -end : genActSIMD + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + + end : genActSIMD //-------------------- Buffer for weights --------------------\\ -localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; -typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; + typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; -for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; - end - end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; - end : genExternalPregWeight - else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; - end : genInpDSPWeight - end : genWeightSIMD + for (genvar j=0; j1 ? 
TOTAL_PREGS-1 : 0; -end : genWeightPE + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + + end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ -for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[j] = pp; - end - - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSPChain -end : genDSPPE + for (genvar j=0; j0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. 
Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
+ ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain + end : genDSPPE endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 8765c50a26..6c7eaeaeca 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -41,36 +41,36 @@ module mvu_8sx9_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", + parameter RAM_STYLE = "auto", localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + 
localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned NF = MH/PE, localparam int unsigned OUTPUT_LANES = PE, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control - input logic ap_clk, - input logic ap_rst_n, + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); //-------------------- Parameter sanity checks --------------------\\ @@ -121,13 +121,13 @@ module mvu_8sx9_axi #( .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); - //-------------------- Input control --------------------\\ +//-------------------- Input control --------------------\\ uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; - //-------------------- Core MVU --------------------\\ +//-------------------- Core MVU --------------------\\ uwire ovld; uwire [PE-1:0][57:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; @@ -138,7 +138,7 @@ module mvu_8sx9_axi #( .vld(ovld), .p(odat) ); - //-------------------- Output register slice --------------------\\ 
+//-------------------- Output register slice --------------------\\ struct { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -148,7 +148,7 @@ module mvu_8sx9_axi #( uwire b_load; always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; + if(rst) A <= '{ vld: 0, default: 'x }; else if(!A.vld || b_load) begin A.vld <= ovld && en; for(int unsigned i = 0; i < PE; i++) begin @@ -169,7 +169,7 @@ module mvu_8sx9_axi #( always_ff @(posedge clk) begin if(rst) B <= '{ default: 'x }; else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end end diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv index ea97e0708c..70ffa096ef 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -33,176 +33,176 @@ module mvu_8sx9_axi_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic 
clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned 
ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index ff3779d211..2456eb3a47 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -33,7 +33,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter MW = $MW$, - parameter MH = $MH$, + parameter MH = $MH$, parameter PE = $PE$, parameter SIMD = $SIMD$, parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, @@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter RAM_STYLE = $IBUF_RAM_STYLE$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control - input logic ap_clk, - input logic ap_rst_n, + // Global Control + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); mvu_8sx9_axi #( diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv index ea3ecbbd70..adf6a8f9c2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -33,133 +33,133 @@ module mvu_8sx9_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function 
activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + endmodule From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 08:54:45 +0100 Subject: [PATCH 009/123] [rtl custom op]: fix to indentation --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 6c7eaeaeca..5f215927d8 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -32,25 +32,25 @@ *****************************************************************************/ module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, parameter RAM_STYLE = "auto", - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -76,31 +76,31 @@ module mvu_8sx9_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; end if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; end if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; end if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; end if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; end if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain 
length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); end if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; end end From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 14:33:13 +0100 Subject: [PATCH 010/123] [rtl custom-op]: minor changes for compiler integration --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 2456eb3a47..502a72d3f2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = $IBUF_RAM_STYLE$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -85,6 +85,6 @@ mvu_8sx9_axi #( .m_axis_output_tdata(m_axis_output_tdata), .m_axis_output_tvalid(m_axis_output_tvalid), .m_axis_output_tready(m_axis_output_tready) -) +); -endmodule : mvau_8sx9_axi_wrapper \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:26:05 +0100 Subject: [PATCH 011/123] [rtl custom op]: moved testbenches to separate directory --- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 213 ++++++++++++++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv create mode 100644 
finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..c8bfe5370a --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..08a349da84 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,213 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 90; + localparam int unsigned MH = 16; + localparam int unsigned SIMD = 9; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1; + localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge 
clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .MVU_IMPL_STYLE(MVU_IMPL_STYLE) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:27:50 +0100 Subject: [PATCH 012/123] [rtl custom op]: fixed output width to ACCU_WIDTH --- finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index d082d4fb2e..5af27ab0ce 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -36,19 +36,25 @@ module mvu_8sx9 #( int unsigned SIMD, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) ) ( - input logic clk, + // Global Control + input logic clk, input logic rst, input logic en, + + // Input input logic last, - input logic zero, - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, - output logic vld, - output logic [PE-1:0][57:0] p + input logic zero, // ignore 
current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p ); //-------------------- Declare global signals --------------------\\ @@ -146,7 +152,7 @@ module mvu_8sx9 #( uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[j] = pp; + assign p[j] = pp[ACCU_WIDTH-1:0]; end DSP58 #( @@ -281,4 +287,4 @@ module mvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule +endmodule : mvu_8sx9 From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:06 +0100 Subject: [PATCH 013/123] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv new file mode 100644 index 0000000000..5d8700738f --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -0,0 +1,194 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
+ *****************************************************************************/ + +module mvu_axi #( + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + parameter RAM_STYLE = "auto", + parameter MVU_IMPL_STYLE, + + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (ACTIVATION_WIDTH > 9) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum 
of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; + end + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + + if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + 
.vld(ovld), .p(odat) + ); + end + //else begin + // $error("Unrecognized MVU_IMPL_STYLE!"); + // $finish; + //end + +//-------------------- Output register slice --------------------\\ + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin + if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_axi \ No newline at end of file From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:45 +0100 Subject: [PATCH 014/123] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v new file mode 100644 index 0000000000..323d2711e4 --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +mvu_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + 
.m_axis_output_tready(m_axis_output_tready) +); + +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:15:16 +0100 Subject: [PATCH 015/123] [rtl mvu]: added behavioral model DSP58 --- finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++-------------- 1 file changed, 212 insertions(+), 131 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 5af27ab0ce..2d1da26efb 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -38,7 +38,8 @@ module mvu_8sx9 #( int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0 ) ( // Global Control @@ -70,7 +71,10 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end end assign vld = L[0]; @@ -155,135 +159,212 @@ module mvu_8sx9 #( assign p[j] = pp[ACCU_WIDTH-1:0]; end - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. 
+ if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[i] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[j][i]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[j][i-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[j][i-1]; + end + end + else assign Preg = Mreg + pcout[j][i-1]; + end + assign pp = Preg; + assign pcout[j][i] = pp; + end : genBehav + + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP end : genDSPChain end : genDSPPE From 
18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:18:58 +0100 Subject: [PATCH 016/123] [rtl mvu]: extended flow control wrapper with additional compute core and other minor changes --- finn-rtllib/mvu/mvu_axi.sv | 51 +++++++++++++++++++------------ finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++--------------- 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 5d8700738f..e4a919ba88 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -41,8 +41,8 @@ module mvu_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - parameter MVU_IMPL_STYLE, + bit FORCE_BEHAVIORAL = 0, + string MVU_IMPL_STYLE, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -96,12 +96,14 @@ module mvu_axi #( $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); $finish; end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end end end @@ -116,7 +118,7 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), 
.irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -133,28 +135,37 @@ module mvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end - //else begin - // $error("Unrecognized MVU_IMPL_STYLE!"); - // $finish; - //end + else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else initial begin + $error("Unrecognized MVU_IMPL_STYLE!"); + $finish; + end 
//-------------------- Output register slice --------------------\\ - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } A = '{ vld: 0, default: 'x}; @@ -175,7 +186,7 @@ module mvu_axi #( end end - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } B = '{ vld: 0, default: 'x}; diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index 323d2711e4..b79ba6bbd1 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - input logic ap_clk, - input logic ap_rst_n, - + input ap_clk, + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY ); mvu_axi #( .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), 
.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) inst ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) ); endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:22:51 +0100 Subject: [PATCH 017/123] [rtl mvu]: fix to done_len flag when SIMD dimension fully unrolled and PyVerilator-related syntax change --- finn-rtllib/mvu/replay_buffer.sv | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 685ac03137..89bbbdb88f 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -35,8 +35,7 @@ module replay_buffer #( int unsigned LEN, // Sequence length int unsigned REP, // Sequence replay count - int unsigned W, // Data width - parameter RAM_STYLE = "auto" // ram style for buffer {block, distributed, ultra, auto} + int unsigned W // Data width )( 
input logic clk, input logic rst, @@ -54,7 +53,7 @@ module replay_buffer #( typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; count_t Count = 0; - uwire done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; + uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; uwire done_rep; uwire done_all = done_len && done_rep; @@ -83,7 +82,6 @@ module replay_buffer #( end assign first_rep = FirstRep; - (* RAM_STYLE = RAM_STYLE *) data_t Buf[LEN]; if(LEN == 1) begin : genTrivial always_ff @(posedge clk) begin @@ -92,7 +90,10 @@ module replay_buffer #( end : genTrivial else begin : genShift always_ff @(posedge clk) begin - if(shift) Buf <= { odat, Buf[0:LEN-2] }; + if(shift) begin + Buf[0] <= odat; + Buf[1:LEN-1] <= Buf[0:LEN-2]; + end end end : genShift From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:23:22 +0100 Subject: [PATCH 018/123] [rtl mvu tb]: updated testbench --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 08a349da84..ef5fa7d682 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,17 +35,18 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam int unsigned MW = 90; - localparam int unsigned MH = 16; - localparam int unsigned SIMD = 9; - localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1; - localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + localparam int unsigned MW = 50; + localparam int unsigned MH = 8; + localparam int unsigned SIMD = 10; + localparam int unsigned PE = 2; + localparam int unsigned SEGMENTLEN = 2; + localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; + localparam bit FORCE_BEHAVIORAL = 1; // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int 
unsigned WEIGHT_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; + localparam bit SIGNED_ACTIVATIONS = 0; // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; @@ -94,7 +95,7 @@ module mvu_axi_tb(); for (int i=0; i 1; + activations.vld = $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -201,6 +202,7 @@ module mvu_axi_tb(); .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) dut ( From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:25:10 +0100 Subject: [PATCH 019/123] [builder]: added specialize_to_rtl step and changed standalone threshold layers to be by default true --- src/finn/builder/build_dataflow_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 4c3e4ff899..24940489df 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -121,6 +121,7 @@ class VerificationStepType(str, Enum): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", @@ -233,7 +234,7 @@ class DataflowBuildConfig: #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. 
- standalone_thresholds: Optional[bool] = False + standalone_thresholds: Optional[bool] = True #: (Optional) Whether optimizations that minimize the bit width of the #: weights and accumulator will be applied. Because this optimization relies From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:03 +0100 Subject: [PATCH 020/123] [builder]: added specialize_to_rtl step --- src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index e43a29d632..3e4d047a51 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -123,6 +123,7 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model +def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" + specialize_to_rtl_transforms = [ + to_rtl.InferRTLMatrixVectorActivation() + ] + for trn in specialize_to_rtl_transforms: + model = model.transform(trn) + return model + + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" if cfg.minimize_bit_width: @@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, + "step_specialize_to_rtl": step_specialize_to_rtl, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, 
"step_set_fifo_depths": step_set_fifo_depths, From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:44 +0100 Subject: [PATCH 021/123] [custom op]: added custom op MatrixVectorActivation_rtl --- src/finn/custom_op/fpgadataflow/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 56d4230a3a..19c0ddd999 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -49,6 +49,7 @@ from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, @@ -70,6 +71,7 @@ custom_op["DownSampler"] = DownSampler custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation +custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:28:34 +0100 Subject: [PATCH 022/123] [custom op]: added additional attribute to enable conversion to RTL (custom-op) --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index aa987384dd..e54abb0c3f 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -70,7 +70,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -125,6 +125,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # Flag to specify whether RTL-based or HLS-based implementation is preferred + "impl": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:30:15 +0100 Subject: [PATCH 023/123] [custom op]: modified ip-stitching and code generation --- .../matrixvectoractivation_rtl.py | 231 ++++++++++-------- 1 file changed, 127 insertions(+), 104 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c8a0aa675b..6b1c2f3be7 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math +from shutil import copy import numpy as np import os import textwrap @@ -45,6 +46,12 @@ pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None from . import templates @@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp): """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): @@ -78,11 +85,6 @@ def get_nodeattr_types(self): "outputDataType": ("s", True, ""), # FINN DataType for accumulator -- auto-computed and updated "accDataType": ("s", False, "INT32"), - # use xnor-popcount for binary weights/inputs, thus treating them - # as bipolar - "binaryXnorMode": ("i", False, 0, {0, 1}), - # no-activation mode (produce accumulators) - "noActivation": ("i", False, 0, {0, 1}), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -105,16 +107,6 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed", "ultra"}, ), - # FPGA resource type for threshold memories (if noActivation is False) - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - "ram_style_thresholds": ( - "s", - False, - "auto", - {"auto", "block", "distributed"}, - ), # (mem_mode = decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. @@ -125,6 +117,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -142,7 +136,6 @@ def calc_wmem(self): def calc_tmem(self): """Calculates and returns TMEM.""" - assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer" return 0 def make_shape_compatible_op(self, model): @@ -192,27 +185,9 @@ def verify_node(self): """The required MatrixVectorActivation attributes do not exist.""" ) - # verify the number of inputs depending on noActivation value - # check noActivation value to determine the number of inputs - no_act = self.get_nodeattr("noActivation") - - if no_act == 1: - if len(self.onnx_node.input) == 2: - info_messages.append("The number of inputs is correct") - else: - info_messages.append( - """RTL-based MatrixVectorActivation needs in no - activation mode 2 inputs (data input and weights)""" - ) - elif no_act == 0: - info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer") - else: - info_messages.append( - """noActivation attribute contains {} should - be 1 for RTL-based MatrixVectorActivation""".format( - no_act - ) - ) + num_of_inputs = len(self.onnx_node.input) + if num_of_inputs!=2: + info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) mem_mode = self.get_nodeattr("mem_mode") @@ -221,6 +196,7 @@ def verify_node(self): return info_messages +# TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -242,6 +218,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier +# TODO: Add in replay_buffer estimation 
def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -268,7 +245,7 @@ def bram_estimation(self): ): return 0 # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + # assuming decoupled (RTL) memory if mem_width == 1: return math.ceil(omega / 16384) elif mem_width == 2: @@ -282,6 +259,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) +# TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -294,6 +272,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity +# TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -308,7 +287,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -348,23 +327,14 @@ def lut_estimation(self): # accumulator acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits return int( c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2 ) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -380,7 +350,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -389,6 +359,7 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 + # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. 
Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -431,8 +402,8 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() + assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp - assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits" return w_width else: return 0 @@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model): adt = DataType.get_smallest_possible(-acc_max - 1) else: adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] + # Note: we are interested in simply the width of the output dot product. 
+ # Padding the actual output stream to a multiple of 8-bits is done in the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): 1, -1, pe * simd ) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() - if weight_file_mode == "decoupled_verilog_dat": + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings @@ -638,7 +610,7 @@ def generate_params(self, model, path): weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # also save weights as Verilog .dat file + # Also save weights as Verilog .dat file # note that we provide two different .dat files, one for synth # and one for synthesis. 
this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred @@ -693,7 +665,6 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights - # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -709,7 +680,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") in_ind += 1 if mode == "rtlsim": @@ -759,7 +730,7 @@ def execute_node(self, context, graph): def code_generation_ipgen(self, model, fpgapart, clk): """Normally: Generates C++ code and tcl script for IP generation. Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() + self.generate_hdl(model, fpgapart, clk) def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" @@ -828,11 +799,21 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv" + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, 
self.onnx_node.name)) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "xilinx.com:user:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -947,12 +928,6 @@ def get_op_and_param_counts(self): weight_param_type = "param_weight_%db" % (weight_bits) weight_count = in_features * out_features ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count return ret_dict def derive_characteristic_fxns(self, period): @@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def generate_hdl(self): -#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded - template_path, code_gen_dict = self.prepare_codegen_default() +# TODO: characterize max_clk and implement this function in look-up style + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP chain to meet target clock frequency + segmentlen = 0 + return segmentlen + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the supported RTL module + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" + if (act_width == 4 and weight_width == 4): + return "mvu_4sx4u" + else: + if (is_versal): + return "mvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + template_path, code_gen_dict = 
self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) -#TODO: currently only ram_style=auto is supported + ram_style = self.get_nodeattr("ram_style") - if ram_style == "auto": - continue - else: - raise Exception("Unrecognized ram_style for MatrixVectorActivation") + assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to templates - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # apply code generation to template with open(template_path, "r") as f: - template = f.read() + template_wrapper = f.read() for key in code_gen_dict: # transform list into long string separated by '\n' code_gen_line = "\n".join(code_gen_dict[key]) - template = template.replace(key, code_gen_line) template_wrapper = template_wrapper.replace(key, code_gen_line) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), "w", ) as f: - f.write(template) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" ), "w", ) as f: - f.write(template_wrapper) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) - def 
prepare_codegen_default(self): - # TODO: Differentiate between PE folding and fully unrolled along MH dimension + def prepare_codegen_default(self, fpgapart, clk): template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl" + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" ) + code_gen_dict = {} - - code_gen_dict["$PE$"] = self.get_nodeattr("PE") - code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD") - code_gen_dict["$MW$"] = self.get_nodeattr("MW") - code_gen_dict["$MH$"] = self.get_nodeattr("MH") - code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth() - code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth() - code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth() + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] return template_path, code_gen_dict + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [ + 
code_gen_dir, + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" + ] + verilog_files = [ + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name() + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim \ No newline at end of file From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:31:35 +0100 Subject: [PATCH 024/123] [tests]: initial version of unit test for RTL custom op and specialize_to_rtl transformation for MVU --- .../test_fpgadataflow_mvau_rtl.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py new file mode 100644 index 0000000000..20a249bd08 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -0,0 +1,172 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +import os + +import numpy as np +from onnx import TensorProto, helper +from qonnx.util.basic import ( + qonnx_make_model, + gen_finn_dt_tensor +) +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.core.datatype import DataType +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from qonnx.transformation.general import ApplyConfig +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +#import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + +def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): + (ofm_h, ofm_w) = ofm_shape + ofm = helper.make_tensor_value_info( + "ofm", + TensorProto.FLOAT, + (1, ofm_h, ofm_w, mh) + ) + + matmul_node = helper.make_node( + "MatMul", + ["ifm", "weights"], + 
["ofm"] + ) + graph = helper.make_graph( + nodes=[matmul_node], + name="matmul_graph", + inputs=[ifm], + outputs=[ofm] + ) + + model = qonnx_make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_initializer("weights", W) + + # model.set_tensor_layout("ifm", DataLayout.NHWC) + + return model + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.parametrize("mw", [90]) +#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16]) +@pytest.mark.parametrize("pe", [16]) +#@pytest.mark.parametrize("simd", [1, 30, 90]) +@pytest.mark.parametrize("simd", [90]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +@pytest.mark.parametrize("wdt", [DataType["UINT4"]]) +#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): + # Create test input vector (produced by SWG) + ofm_shape = (5, 5) + ofm_h, ofm_w = ofm_shape + ifm = helper.make_tensor_value_info( + "ifm", + TensorProto.FLOAT, + [1, ofm_h, ofm_w, mw] + ) + weights = helper.make_tensor_value_info( + "weights", + TensorProto.FLOAT, + [mw, mh] + ) + W = gen_finn_dt_tensor(wdt, (mw, mh)) + model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + + model.save(build_dir+"/matmul.onnx") + + # Create MatMul & obtain golden reference output + A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + input_dict = 
prepare_inputs(A) + + ## Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict) + + # Create MVAU (HLS) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MatrixVectorActivation_0": { + "PE" : pe, + "SIMD" : simd, + "mem_mode" : "decoupled", + "ram_style" : "auto", + "resType" : "dsp", + "impl" : "rtl" + } + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir+"/mvau_hls.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + + # Apply convert-to-rtl step + model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + model.save(build_dir+"/mvau_rtl.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"] + + model.save(build_dir+"/mvau_rtl_sim.onnx") + + assert (output_mvau_hls == output_mvau_rtl).all() + assert (output_mvau_hls.size > 0) + + +# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" +# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:32:52 +0100 Subject: [PATCH 025/123] [rtl mvu]: specialized compute core for 4-bit weights and activations for DSP48/DSP58 --- finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 
finn-rtllib/mvu/mvu_4sx4u.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..5993154355 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,359 @@ +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe]+:3] = ww[pe]; + aa[D[pe]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 
2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4[i] = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_4sx4u \ No newline at end of file From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:33:31 +0100 Subject: [PATCH 026/123] [rtl mvu]: specialized compute core for > 4-bit weights and activations for DSP48 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv 
b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..e06a92c8fa --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,358 @@ +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? 
PE : 2*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 
2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i == 0) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 \ No newline at end of file From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:34:23 +0100 Subject: [PATCH 027/123] [fpgadataflow transform]: initial specialize_to_rtl_layers-transform for MVU --- .../fpgadataflow/specialize_to_rtl_layers.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py new file mode 100644 index 0000000000..7d677ec216 --- /dev/null +++ 
b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023, AMD +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from qonnx.transformation.base import Transformation +from qonnx.custom_op.registry import getCustomOp +from qonnx.core.datatype import DataType +from onnx import helper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth + +class InferRTLMatrixVectorActivation(Transformation): + """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatrixVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + mvau_input = n.input[0] + mvau_weight = n.input[1] + mvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = 
getCustomOp(n).get_nodeattr("outputDataType") + numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors") + mw = getCustomOp(n).get_nodeattr("MW") + mh = getCustomOp(n).get_nodeattr("MH") + simd = getCustomOp(n).get_nodeattr("SIMD") + pe = getCustomOp(n).get_nodeattr("PE") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + + new_node = helper.make_node( + "MatrixVectorActivation_rtl", + [mvau_input, mvau_weight], + [mvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + numInputVectors=numInputVectors, + mem_mode=mem_mode, + name=n.name + "_rtl", + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return (model, graph_modified) \ No newline at end of file From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Tue, 9 May 2023 09:41:15 +0200 Subject: [PATCH 028/123] [rtl mvu] fixes for latest memstream + linting --- .../matrixvectoractivation_rtl.py | 136 ++++++++++-------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 6b1c2f3be7..8fd261d395 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math -from shutil import copy import numpy as np import os import textwrap @@ -40,20 +39,18 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, - numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir try: from pyverilator import PyVerilator except ModuleNotFoundError: PyVerilator = None -from . import templates # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): my_attrs = { @@ -186,17 +182,24 @@ def verify_node(self): ) num_of_inputs = len(self.onnx_node.input) - if num_of_inputs!=2: - info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) + if num_of_inputs != 2: + info_messages.append( + "RTL-based MatrixVectorActivation expects two inputs " + "(weights and activation), but got {} inputs.".format( + len(self.onnx_node.input) + ) + ) mem_mode = self.get_nodeattr("mem_mode") if mem_mode != "decoupled": - info_messages.append("RTL-based MVAU supports only decoupled weights currently") + info_messages.append( + "RTL-based MVAU supports only decoupled weights currently" + ) return info_messages -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -218,7 +221,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier -# TODO: Add in replay_buffer estimation + # 
TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -259,7 +262,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -272,7 +275,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -287,7 +290,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -328,13 +331,9 @@ def lut_estimation(self): acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - return int( - c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts)) - + c2 - ) + return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -350,7 +349,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX: worst case estimates since segmentlen is not known at this point + # TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -359,7 +358,9 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10) + # Actual exp_cycles is probably slightly larger (say 3 cycles + # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. + # Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -402,7 +405,9 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() - assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp return w_width else: @@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model): else: adt = DataType.get_smallest_possible(acc_max) # Note: we are interested in simply the width of the output dot product. 
- # Padding the actual output stream to a multiple of 8-bits is done in the RTL component + # Padding the actual output stream to a multiple of 8-bits is done in + # the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -615,9 +621,7 @@ def generate_params(self, model, path): # and one for synthesis. this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) # sim weights are always the true weights self.make_weight_file( @@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk): def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" - pass + pass def code_generation_cppsim(self, model): """Normally: Generates C++ code for simulation (cppsim).""" - pass + pass def compile_singlenode_code(self): pass @@ -803,19 +807,28 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") sourcefiles = [ - os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), rtllib_dir + "mvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_8sx9.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv" + rtllib_dir + "mvu_8sx8u_dsp48.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + 
self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:FINN:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -849,11 +862,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) -# TODO: characterize max_clk and implement this function in look-up style + # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency segmentlen = 0 return segmentlen def _resolve_impl_style(self, fpgapart): - # Based on target device and activation/weight-width, choose the supported RTL module + # Based on target device and activation/weight-width, choose the + # supported RTL module act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" - if (act_width == 4 and weight_width == 4): + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if (is_versal): + if is_versal: return "mvu_8sx9_dsp58" else: 
return "mvu_8sx8u_dsp48" @@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk): template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + ] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) ram_style = self.get_nodeattr("ram_style") - assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" + assert ( + ram_style == "auto" + ), "Unrecognized ram_style for MatrixVectorActivation_rtl" # apply code generation to template with open(template_path, "r") as f: @@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" - ) - + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + code_gen_dict = {} code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] - code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$ACTIVATION_WIDTH$"] = [ + str(self.get_input_datatype(0).bitwidth()) + ] code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] - code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() 
< 0) else [str(0)] + ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] @@ -1035,15 +1058,10 @@ def prepare_rtlsim(self): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [ - code_gen_dir, - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" - ] - verilog_files = [ - self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" - ] + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] # build the Verilator emu library sim = PyVerilator.build( @@ -1051,9 +1069,9 @@ def prepare_rtlsim(self): build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name() + top_module_name=self.get_verilog_top_module_name(), ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) - - return sim \ No newline at end of file + + return sim From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 May 2023 10:56:07 +0100 Subject: [PATCH 029/123] [rtl custom_op]: add support for external weights --- .../matrixvectoractivation_rtl.py | 67 ++++++++++--------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 8fd261d395..162b5e2e16 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -192,9 +192,9 @@ def verify_node(self): mem_mode = 
self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": + if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled weights currently" + "RTL-based MVAU supports only decoupled or external weights." ) return info_messages @@ -612,35 +612,20 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "decoupled": + if mem_mode in ["decoupled", "external"]: weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # Also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. 
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl + ) else: raise Exception( - """Please set mem_mode to "decoupled", + """Please set mem_mode to "const", "decoupled", or "external", currently no other parameter value is supported!""" ) @@ -695,7 +680,7 @@ def execute_node(self, context, graph): ) super().reset_rtlsim(sim) super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": + if mem_mode in ["external", "decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -903,9 +888,31 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 11 May 2023 11:04:28 +0100 Subject: [PATCH 030/123] Specify clock and reset associations of bus interfaces. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++- finn-rtllib/mvu/mvu_axi_wrapper.v | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 502a72d3f2..fb3c62a15a 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_rst_n, // Weight Stream diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index b79ba6bbd1..d8acaefcc7 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_rst_n, // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Mon, 15 May 2023 10:16:48 +0200 Subject: [PATCH 031/123] [rtlmvu] More fixes for memstream and param gen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git 
a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 162b5e2e16..1791327e78 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,7 +612,11 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) +<<<<<<< HEAD if mem_mode in ["decoupled", "external"]: +======= + if mem_mode == "decoupled" or mem_mode == "external": +>>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -821,22 +825,16 @@ def code_generation_ipi(self): ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:49:10 +0200 Subject: [PATCH 032/123] [Build] apply config to only FIFO nodes in step_set_fifo_depths --- src/finn/builder/build_dataflow_steps.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 65ab2b0b93..d4af757491 100644 --- 
a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,6 +53,7 @@ from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -123,7 +124,6 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" - specialize_to_rtl_transforms = [ - to_rtl.InferRTLMatrixVectorActivation() - ] + """Convert layers implemented in HLS to an equivalent specialized RTL + implementation if possible.""" + specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] for trn in specialize_to_rtl_transforms: model = model.transform(trn) return model - + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" @@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: - model = model.transform(ApplyConfig(cfg.folding_config_file)) + model = model.transform( + ApplyConfig( + cfg.folding_config_file, + node_filter=lambda x: x.op_type == "StreamingFIFO", + ) + ) # extract the final configuration and save it as json hw_attrs = [ From 
08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 07:58:41 +0100 Subject: [PATCH 033/123] Revised control interface attributes. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 8 ++++--- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index fb3c62a15a..e15f77fbae 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_rst_n, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, + input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input s_axis_weights_tvalid, + output s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, + input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input s_axis_input_tvalid, + output s_axis_input_tready, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] 
m_axis_output_tdata, + output m_axis_output_tvalid, + input m_axis_output_tready ); mvu_8sx9_axi #( @@ -89,4 +90,4 @@ mvu_8sx9_axi #( .m_axis_output_tready(m_axis_output_tready) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index d8acaefcc7..239c5bbacd 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, + // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, input weights_V_TVALID, @@ -87,4 +89,4 @@ mvu_axi #( .m_axis_output_tready(out_V_TREADY) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 09:16:50 +0100 Subject: [PATCH 034/123] Mask device primitives from Verilator in favor of using behavioral code. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx9.sv | 29 ++++++++++++++--------- 3 files changed, 68 insertions(+), 37 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 5993154355..21594e46ac 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,6 +19,12 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -59,17 +65,21 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -87,7 +97,7 @@ module mvu_4sx4u #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
- if (FORCE_BEHAVIORAL) begin : genBehav + if (BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -121,6 +131,7 @@ module mvu_4sx4u #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -252,6 +263,7 @@ module mvu_4sx4u #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1[3:1] = '{ default: 0 }; @@ -356,4 +368,4 @@ module mvu_4sx4u #( end : genPipes -endmodule : mvu_4sx4u \ No newline at end of file +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index e06a92c8fa..09db360b77 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[1]), - .O5(xx[0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 
0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1 = '{ default: 0 }; @@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #( end : genPipes -endmodule : mvu_8sx8u_dsp48 \ No newline at end of file +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 2d1da26efb..f8e2ab3985 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,11 +52,17 @@ module mvu_8sx9 #( input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations - + // Ouput output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; @@ -75,7 +81,7 @@ module mvu_8sx9 #( L[1+MAX_PIPELINE_STAGES] <= last; L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; end - end + end 
assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ @@ -87,7 +93,7 @@ module mvu_8sx9 #( else if(en) begin Z[0] <= zero; if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end + end end end; @@ -157,12 +163,12 @@ module mvu_8sx9 #( if (LAST) begin : genPOUT assign p[j] = pp[ACCU_WIDTH-1:0]; - end + end // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input A/B logic signed [33:0] Areg [INTERNAL_PREGS]; always_ff @(posedge clk) begin @@ -233,7 +239,7 @@ module mvu_8sx9 #( assign pp = Preg; assign pcout[j][i] = pp; end : genBehav - +`ifndef VERILATOR else begin: genDSP DSP58 #( // Feature Control Attributes: Data Path Selection @@ -263,8 +269,8 @@ module mvu_8sx9 #( .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 2'b01, // Y : M 2'b01 // X: M }), // Optional inversion for OPMODE @@ -325,7 +331,7 @@ module mvu_8sx9 #( INTERNAL_PREGS==2 ? 1'b0 : 1'b1, 2'b00, TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 }), // 5-bit input: INMODE control .NEGATE('0), // 3-bit input: Negates the input of the multiplier .OPMODE({ @@ -365,7 +371,8 @@ module mvu_8sx9 #( .RSTP(PREG && rst) // 1-bit input: Reset for PREG ); end : genDSP - end : genDSPChain +`endif + end : genDSPChain end : genDSPPE - + endmodule : mvu_8sx9 From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:48:36 +0200 Subject: [PATCH 035/123] [Deps] update qonnx --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index e039ca9144..f1cf8754f2 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4" +QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac" FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c" BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 12:44:53 +0100 Subject: [PATCH 036/123] Adding folding hints. Impl selection by case statement. --- finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index e4a919ba88..a181f54ac5 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -29,6 +29,14 @@ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @details + * Folding hints: + * - 4-bit MVU: PE scaling should aim at a full multiple of 4. + * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. 
+ * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. *****************************************************************************/ module mvu_axi #( @@ -134,8 +142,9 @@ module mvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 + + case(MVU_IMPL_STYLE) + "mvu_8sx9_dsp58": mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( @@ -143,26 +152,27 @@ module mvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + + "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + + "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else initial begin - $error("Unrecognized MVU_IMPL_STYLE!"); + + default: initial begin + $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; end + endcase //-------------------- Output register slice --------------------\\ struct packed { @@ 
-185,7 +195,7 @@ module mvu_axi #( end end end - + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -196,10 +206,10 @@ module mvu_axi #( if(rst) B <= '{ default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end + end end assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi \ No newline at end of file +endmodule : mvu_axi From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 13:52:40 +0100 Subject: [PATCH 037/123] Fixed behavioral sideband prediction. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 09db360b77..bd1f813af6 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 15:49:19 +0100 Subject: [PATCH 038/123] [rtl mvu]: extension to allow selecting PE values that are not multiples of 4 --- finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..111d651cf5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -50,6 +50,7 @@ module mvu_4sx4u #( localparam int unsigned PE_BEG = 4*c; localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD][3]; @@ -65,12 +66,12 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), .I5(1'b1), .I4(zero), .I3(ww[pe][1]), @@ -86,8 +87,8 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe]+:3] = ww[pe]; - aa[D[pe]+ 3] = ww[pe][3]; + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; end end end : blkVectorize @@ -305,7 +306,7 @@ module mvu_4sx4u #( localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; // Conclusive high part accumulation - if(i < 3) begin : genHi + if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; @@ -323,9 +324,12 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi + else begin : genHiZero + assign hi4[i] = '0; + end : genHiZero // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -346,6 +350,9 @@ module mvu_4sx4u #( if(i == 3) assign up4 = Lo4; else assign lo4[i] = Lo4; end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero end @@ -363,7 +370,7 @@ module mvu_4sx4u #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end 
: genPipes From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Wed, 24 May 2023 15:56:07 +0200 Subject: [PATCH 039/123] [rtlmvu] Avoid unintentional verilator metacomments --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- finn-rtllib/mvu/mvu_8sx9.sv | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..9f101e8c29 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,7 +19,7 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index bd1f813af6..6b54e91b6a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index f8e2ab3985..a601066cfd 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -57,7 +57,7 @@ module mvu_8sx9 #( output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:14:23 +0100 Subject: [PATCH 040/123] [rtl mvu]: extension to allow selecting PE values that are not multiples of 2 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++------------- 1 file 
changed, 32 insertions(+), 25 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 6b54e91b6a..5cc3fa4c49 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_RES = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #( uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; - for(genvar i = 0; i < 2; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; - // Conclusive high part accumulation - if(i == 0) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end + // Conclusive high part accumulation + if(PE_RES == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + 
// Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; - end - assign hi4 = Hi4; - end : genHi + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_RES) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #( if(i == 1) assign up4 = Lo4; else assign lo4 = Lo4; end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero end @@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_RES]; end end : genPipes From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:21:56 +0100 Subject: [PATCH 041/123] [rtl mvu axi]: updated comments on folding hints --- finn-rtllib/mvu/mvu_axi.sv | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index a181f54ac5..cef55949ed 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -31,12 +31,13 @@ * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
* @details * Folding hints: - * - 4-bit MVU: PE scaling should aim at a full multiple of 4. - * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. - * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - 4-bit MVU: PE scaling should divide MH. + * - 8-bit MVU - DSP48: PE scaling should divide MH. + * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ module mvu_axi #( From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:26 +0100 Subject: [PATCH 042/123] [rtl custom op]: minor fixes to codegen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 1791327e78..9f8130806b 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,11 +612,7 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) -<<<<<<< HEAD - if mem_mode in ["decoupled", "external"]: -======= if mem_mode == "decoupled" or mem_mode == "external": ->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -909,8 +905,6 @@ def code_generation_ipi(self): self.onnx_node.name, ) ) 
- cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) - cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd @@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period): # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency - segmentlen = 0 - return segmentlen + return 4 # default to 4 for now def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk): # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - ram_style = self.get_nodeattr("ram_style") - assert ( - ram_style == "auto" - ), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to template with open(template_path, "r") as f: template_wrapper = f.read() From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:58 +0100 Subject: [PATCH 043/123] [specialize-to-rtl]: add ram_style and rt_writeable_weights support --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 7d677ec216..23b6e59abe 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -74,6 +74,8 @@ def apply(self, model): simd = getCustomOp(n).get_nodeattr("SIMD") pe = getCustomOp(n).get_nodeattr("PE") mem_mode = 
getCustomOp(n).get_nodeattr("mem_mode") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( "MatrixVectorActivation_rtl", @@ -91,6 +93,8 @@ def apply(self, model): numInputVectors=numInputVectors, mem_mode=mem_mode, name=n.name + "_rtl", + ram_style=ram_style, + runtime_writeable_weights=runtime_writeable_weights ) graph.node.insert(node_ind, new_node) # remove old node From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:50:05 +0100 Subject: [PATCH 044/123] [rtllib]: change string type to parameter type due to Vivado error --- finn-rtllib/mvu/mvu_axi.sv | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index cef55949ed..46167af95b 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -51,7 +51,7 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - string MVU_IMPL_STYLE, + parameter MVU_IMPL_STYLE, // string type causes error in Vivado localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -163,12 +163,11 @@ module mvu_axi #( "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - default: initial begin $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:51:40 +0100 Subject: [PATCH 
045/123] [rtllib]: renamed variable for consistency --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 5cc3fa4c49..3cd9cef560 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); - localparam int unsigned PE_RES = 2*(c+1) - PE_END; + localparam int unsigned PE_REM = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #( uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; // Conclusive high part accumulation - if(PE_RES == 0) begin : genHi + if(PE_REM == 0) begin : genHi localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; @@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #( for(genvar i = 0; i < 2; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(i >= PE_RES) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG + PE_RES]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end : 
genPipes From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:39:14 +0100 Subject: [PATCH 046/123] Fix improper blocking assignment & linting. --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index ef5fa7d682..b89b58f55b 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -42,12 +42,12 @@ module mvu_axi_tb(); localparam int unsigned SEGMENTLEN = 2; localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config + // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants + // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; localparam int unsigned NUM_OF_DSP = SIMD/3; @@ -57,7 +57,7 @@ module mvu_axi_tb(); localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - // Generate clk and reset signal + // Generate clk and reset signal logic clk = 0; always #5ns clk = !clk; @@ -69,7 +69,7 @@ module mvu_axi_tb(); uwire ap_clk = clk; - // Generate activations + // Generate activations typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; typedef activation_t activation_vector_t[SF]; @@ -94,8 +94,8 @@ module mvu_axi_tb(); for (int i=0; i= 1; + do begin + activations.vld <= $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -104,9 +104,9 @@ module mvu_axi_tb(); activations.dat <= 'x; end - // Generate weights + // Generate weights typedef 
logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; + typedef weight_t weight_matrix_t[NF][SF]; function weight_matrix_t init_WEIGHTS; automatic weight_matrix_t res; @@ -139,7 +139,7 @@ module mvu_axi_tb(); weights.dat <= 'x; end - // Function to compute golden output + // Function to compute golden output // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; @@ -155,12 +155,12 @@ module mvu_axi_tb(); automatic output_vector_t res = '{default: 0}; for (int j = 0; j>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin + else begin $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); $stop; - end + end end - + NF_CNT += 1; end - $finish; + $finish; end // Instantiate DUT @@ -211,5 +211,5 @@ module mvu_axi_tb(); .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), .m_axis_output_tready(outputs.rdy) ); - + endmodule : mvu_axi_tb From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 13:35:04 +0100 Subject: [PATCH 047/123] [test rtl mvu]: modified/extended test cases --- tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 20a249bd08..3db7a718f5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -86,13 +86,12 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} @pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [90]) 
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16]) -@pytest.mark.parametrize("pe", [16]) +@pytest.mark.parametrize("mw", [32]) +@pytest.mark.parametrize("pe", [1, 4, 16]) #@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [90]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -@pytest.mark.parametrize("wdt", [DataType["UINT4"]]) +@pytest.mark.parametrize("simd", [1, 4, 32]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) @@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): assert (output_mvau_hls == output_mvau_rtl).all() assert (output_mvau_hls.size > 0) - - -# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" -# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:36:17 +0100 Subject: [PATCH 048/123] [rtl mvu]: updated DSP58 >4-bit variant to lift SIMD%3==0 restriction --- finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index a601066cfd..439fbc44f9 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -92,77 +92,95 @@ module mvu_8sx9 #( if (rst) Z <= '{default: 0}; else if(en) begin Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; end end end; //-------------------- Buffer for input activations --------------------\\ 
localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; always_ff @(posedge clk) begin if (rst) B <= '{default: 0}; else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? 
B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genExternalPregWeight else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genInpDSPWeight end : genWeightSIMD - end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; if (LAST) begin : genPOUT - assign p[j] = pp[ACCU_WIDTH-1:0]; + assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -174,7 +192,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Areg <= '{ default : 0}; else if (en) begin - Areg[0] <= { 7'bx, a_in_i[i] }; + Areg[0] <= { 7'bx, a_in_i[j] }; if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; end end @@ -182,7 +200,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Breg <= '{ default : 0}; else if (en) begin - Breg[0] <= b_in_i[j][i]; + Breg[0] <= b_in_i[i][j]; if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; end end @@ -217,27 +235,36 @@ module mvu_8sx9 #( end else assign Preg = Mreg; end - else if 
(LAST) begin : genLast + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast always_ff @(posedge clk) begin if (rst) Opmode <= 0; else if (en) Opmode <= L[1]; end always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1]; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; end end else begin : genMid if (PREG) begin : genPregBehav always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[j][i-1]; + else if (en) Preg <= Mreg + pcout[i][j-1]; end end - else assign Preg = Mreg + pcout[j][i-1]; + else assign Preg = Mreg + pcout[i][j-1]; end - assign pp = Preg; - assign pcout[j][i] = pp; + assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR else begin: genDSP @@ -307,7 +334,7 @@ module mvu_8sx9 #( .BCOUT(), // 24-bit output: B cascade .CARRYCASCOUT(), // 1-bit output: Cascade carry .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output // Control outputs: Control Inputs/Status Bits .OVERFLOW(), // 1-bit output: Overflow in add/acc .PATTERNBDETECT(), // 1-bit output: Pattern bar detect @@ -322,7 +349,7 @@ module mvu_8sx9 #( .BCIN('x), // 24-bit input: B cascade .CARRYCASCIN('x), // 1-bit input: Cascade carry .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + .PCIN(FIRST ? 
'x : pcout[i][j-1]), // 58-bit input: P cascade // Control inputs: Control Inputs/Status Bits .ALUMODE(4'h0), // 4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select @@ -339,8 +366,8 @@ module mvu_8sx9 #( 7'b000_0000 }), // 9-bit input: Operation mode // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data + .A({ 7'bx, a_in_i[j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data .C('x), // 58-bit input: C data .CARRYIN('0), // 1-bit input: Carry-in .D('x), // 27-bit input: D data From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:45:26 +0100 Subject: [PATCH 049/123] [rtl mvu]: bug fix for SIMD=1 init_leave_loads --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 4674576d23..ac95b5f8a9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -296,7 +296,7 @@ module mvu_4sx4u #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -8:0] hi4[3]; diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 3cd9cef560..416c12c1cc 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 13 Jul 2023 18:34:05 +0100 Subject: [PATCH 050/123] [mvu rtl]: restrict index i to be less than 3 (within bounds of hi4) --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index ac95b5f8a9..88985312c9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -324,7 +324,7 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi - else begin : genHiZero + else if (i < 3) begin : genHiZero assign hi4[i] = '0; end : genHiZero From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:47:53 +0100 Subject: [PATCH 051/123] Rewrite replay_buffer for input elasticity. --- finn-rtllib/mvu/replay_buffer.sv | 153 ++++++++++++++++++------- finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++ 2 files changed, 242 insertions(+), 41 deletions(-) create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 89bbbdb88f..3dfe72d6c6 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -51,60 +51,131 @@ module replay_buffer #( input logic ordy ); - typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; - count_t Count = 0; - uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; - uwire done_rep; - uwire done_all = done_len && done_rep; + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + // Track position in Sequence + uwire last_item; uwire shift; - uwire clr = rst || (done_all && shift); - always_ff @(posedge clk) begin - if(clr) Count <= 0; - else if(shift) Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1); + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; end - typedef logic [W-1:0] data_t; - uwire data_t rdat; - uwire first_rep; if(REP == 1) begin - assign done_rep = 1; - assign first_rep = 1; - assign rdat = 'x; + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; end else begin - assign done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0; - logic FirstRep = 1; + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 
2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(clr) FirstRep <= 1; - else if(shift) FirstRep <= FirstRep && !done_len; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end - assign first_rep = FirstRep; - data_t Buf[LEN]; - if(LEN == 1) begin : genTrivial - always_ff @(posedge clk) begin - if(shift && FirstRep) Buf[0] <= idat; + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; end - end : genTrivial - else begin : genShift - always_ff @(posedge clk) begin - if(shift) begin - Buf[0] <= odat; - Buf[1:LEN-1] <= Buf[0:LEN-2]; + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 
2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; end end - end : genShift + end - assign rdat = Buf[LEN-1]; end - assign irdy = ordy && first_rep; - assign odat = first_rep? idat : rdat; - assign olast = done_len; - assign ofin = done_all; - assign ovld = first_rep? ivld : 1; - assign shift = ovld && ordy; - -endmodule : replay_buffer \ No newline at end of file +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 
10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 5 Sep 2023 13:53:01 +0100 Subject: [PATCH 052/123] [to-rtl]: Infer unique node names after transformation is applied --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 23b6e59abe..47ed5ce863 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -32,6 +32,7 @@ from onnx import helper from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.general import GiveUniqueNodeNames from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth class InferRTLMatrixVectorActivation(Transformation): @@ -105,5 +106,6 @@ def apply(self, model): model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) return (model, graph_modified) \ No newline at end of file From 
114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 14:56:07 +0100 Subject: [PATCH 053/123] [mvu rtl]: add synthesis directive to handle 'X in simulation --- finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 439fbc44f9..34aa856b1b 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -110,13 +110,17 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; + A[EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end for (genvar j=0; j 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -161,7 +173,11 @@ module mvu_8sx9 #( end : genExternalPregWeight else begin : genInpDSPWeight for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; @@ -178,9 +194,10 @@ module mvu_8sx9 #( localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; localparam bit FIRST = j == 0; localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; + assign p[i] = pp[ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -264,6 +281,7 @@ module mvu_8sx9 #( end else assign Preg = Mreg + pcout[i][j-1]; end + assign pp = Preg; assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:06:36 +0100 Subject: [PATCH 054/123] [replay buffer rtl]: minor fix to when LEN=1 (= AWIDTH=0) --- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 3dfe72d6c6..942f1823ca 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH-1:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + if(wr) Mem[WP[AWIDTH:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH:0]]; end uwire vld = (RP != WP); From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:09:45 +0100 Subject: [PATCH 055/123] [mvu lut]: LUT-based MVU compute core --- finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv new file mode 100644 index 0000000000..b100a589e8 --- /dev/null +++ 
b/finn-rtllib/mvu/mvu_lut.sv @@ -0,0 +1,102 @@ +module mvu_lut #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? 
$signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_lut From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 19 Sep 2023 12:23:55 +0100 Subject: [PATCH 056/123] [custom op]: add preferred_backend attribute --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 73d39ce642..4f24d71ccc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -123,7 +123,7 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), # Flag to specify whether RTL-based or HLS-based implementation is preferred - "impl": ("s", False, "rtl", {"hls", "rtl"}) + "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 13:03:27 +0100 Subject: [PATCH 057/123] Ensure a minimum of two buffer slots even for length-1 sequences. --- finn-rtllib/mvu/replay_buffer.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 942f1823ca..d4342f705c 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -111,7 +111,7 @@ module replay_buffer #( assign last_rep = RepLst; end : blkRep - localparam int unsigned AWIDTH = $clog2(LEN); + localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB typedef logic [W -1:0] data_t; From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 15:07:12 +0100 Subject: [PATCH 058/123] [rtl mvu wrapper]: support for vvu layer and rename --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..6dbf82cb7b --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = "$IS_MVU$", + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), 
+ .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:24:18 +0100 Subject: [PATCH 059/123] [mvu vvu tb]: modified testbench to also support testing VVU on DSP58 --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv new file mode 100644 index 0000000000..82c2e8e7b0 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 1500; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 
1 : PE)*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + else + res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + $signed(res[j/PE][j%PE]) + ( PE > 1 ? 
$signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_vvu_axi_tb From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17 
00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:25:22 +0100 Subject: [PATCH 060/123] [axi wrapper]: minor modification to comment description --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 6dbf82cb7b..788e49a71b 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Verilog AXI-lite wrapper for MVU. + * @brief Verilog AXI-lite wrapper for MVU & VVU. *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:12 +0100 Subject: [PATCH 061/123] [mvu axi]: add support for VVU on DSP58 --- finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 46167af95b..07ad32e6c8 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -28,19 +28,25 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. * Folding hints: - * - 4-bit MVU: PE scaling should divide MH. - * - 8-bit MVU - DSP48: PE scaling should divide MH. 
- * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ -module mvu_axi #( +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, int unsigned PE, @@ -51,16 +57,16 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - parameter MVU_IMPL_STYLE, // string type causes error in Vivado + bit M_REG_LUT = 1, + // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -93,27 +99,31 @@ module mvu_axi #( $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end if (WEIGHT_WIDTH > 8) begin $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); $finish; end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end end - if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); end if (SEGMENTLEN > (SIMD+2)/3) begin $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); $finish; end end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end end uwire clk = ap_clk; @@ -127,10 +137,10 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - 
.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); //-------------------- Input control --------------------\\ @@ -139,37 +149,60 @@ module mvu_axi #( assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU --------------------\\ +//-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - case(MVU_IMPL_STYLE) - "mvu_8sx9_dsp58": - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); default: initial begin - $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); + $error("Unrecognized COMPUTE_CORE 
'%s'", COMPUTE_CORE); $finish; end endcase @@ -203,7 +236,7 @@ module mvu_axi #( assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; + if(rst) B <= '{ vld: 0, default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end @@ -212,4 +245,4 @@ module mvu_axi #( assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi +endmodule : mvu_vvu_axi From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:36 +0100 Subject: [PATCH 062/123] [mvu vvu axi]: renamed file for consistency purposes --- finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..07ad32e6c8 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. 
+ * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + 
$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(IS_MVU ? 
NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU/VVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + +//-------------------- Output register slice --------------------\\ + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin + 
if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ vld: 0, default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_vvu_axi From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:39:22 +0100 Subject: [PATCH 063/123] [mvu 8sx9]: added support for VVU on DSP58, resolved PyVerilator-caused error and added synthesis directive to handle 'X in input data --- finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 34aa856b1b..52a93739d6 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -31,7 +31,8 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. *****************************************************************************/ -module mvu_8sx9 #( +module mvu_vvu_8sx9 #( + parameter IS_MVU, int unsigned PE, int unsigned SIMD, int unsigned ACTIVATION_WIDTH, @@ -39,7 +40,9 @@ module mvu_8sx9 #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0 + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD ) ( // Global Control @@ -51,7 +54,7 @@ module mvu_8sx9 #( input logic last, input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput output logic vld, @@ -67,9 +70,10 @@ module mvu_8sx9 #( //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - uwire [26:0] a_in_i [CHAINLEN]; + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [57:0] pcout [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator //-------------------- Shift register for opmode select signal --------------------\\ localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) @@ -99,48 +103,48 @@ module mvu_8sx9 #( //-------------------- Buffer for input activations --------------------\\ localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; - for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - a[3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end end - end - for (genvar j=0; j Date: Thu, 21 Sep 2023 16:39:52 +0100 Subject: [PATCH 064/123] [mvu vvu 8sx9]: renamed compute core for consistency --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv new file mode 100644 index 0000000000..52a93739d6 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -0,0 +1,427 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_vvu_8sx9 #( + parameter IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? 
'1 : +// synthesis translate_on + w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 
0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : 
genDSPPE + +endmodule : mvu_vvu_8sx9 From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:58:20 +0100 Subject: [PATCH 065/123] [axi wrapper]: changed parameter to localparam --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 788e49a71b..270fe7351f 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:59:32 +0100 Subject: [PATCH 066/123] [axi]: added support for LUT-based VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 07ad32e6c8..ff677fc244 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -195,8 +195,8 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:07 +0100 Subject: [PATCH 067/123] [mvu vvu 8sx9]: minor change to list of generics --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv index 52a93739d6..2aa9d71b6c 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -32,7 +32,7 @@ *****************************************************************************/ module mvu_vvu_8sx9 #( - parameter IS_MVU, + bit IS_MVU, int unsigned PE, int unsigned 
SIMD, int unsigned ACTIVATION_WIDTH, @@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD ) ( // Global Control From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:51 +0100 Subject: [PATCH 068/123] [mvu lut]: added support for VVU --- finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv index b100a589e8..c100910d75 100644 --- a/finn-rtllib/mvu/mvu_lut.sv +++ b/finn-rtllib/mvu/mvu_lut.sv @@ -1,13 +1,15 @@ -module mvu_lut #( - int unsigned PE, - int unsigned SIMD, +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, bit SIGNED_ACTIVATIONS, bit M_REG = 1, - localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD )( // Global Control input logic clk, @@ -17,8 +19,8 @@ module mvu_lut #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations // Ouput output logic vld, @@ -63,16 +65,16 @@ module mvu_lut #( always_ff @(posedge clk) begin if(rst) M[j] = '{ default : 0 }; else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? 
$signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication end assign m1[j] = M[j]; end : genMreg else begin : genNoMreg assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); end : genNoMreg end : genSIMD @@ -99,4 +101,4 @@ module mvu_lut #( assign p[i] = P2[i]; end : genPE -endmodule : mvu_lut +endmodule : mvu_vvu_lut From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:01:10 +0100 Subject: [PATCH 069/123] [mvu vvu lut]: renamed file for consistency --- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 +++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv new file mode 100644 index 0000000000..c100910d75 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_lut.sv @@ -0,0 +1,104 @@ +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_vvu_lut From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 15:58:34 +0100 Subject: [PATCH 070/123] Revert to proper address truncation without generation bit. 
--- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index d4342f705c..3e2766f63d 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH:0]]; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end uwire vld = (RP != WP); From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:04:05 +0100 Subject: [PATCH 071/123] remove deletd/renamed files --- finn-rtllib/mvu/mvu_8sx9.sv | 427 ------------------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ----------- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 ------------ finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 93 ------ finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 ---------- finn-rtllib/mvu/mvu_axi.sv | 248 -------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 92 ------ finn-rtllib/mvu/mvu_lut.sv | 104 ------ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 215 ------------- 9 files changed, 1731 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_lut.sv delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv deleted file mode 100644 index 52a93739d6..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ /dev/null @@ -1,427 +0,0 @@ -/****************************************************************************** - * 
Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
- *****************************************************************************/ - -module mvu_vvu_8sx9 #( - parameter IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0, - - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD - ) - ( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations - - // Ouput - output logic vld, - output logic [PE-1:0][ACCU_WIDTH-1:0] p - ); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - -//-------------------- Declare global signals --------------------\\ - localparam int unsigned CHAINLEN = (SIMD+2)/3; - localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - localparam int unsigned PE_ACTIVATION = IS_MVU ? 
1 : PE; - uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; - uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator - -//-------------------- Shift register for opmode select signal --------------------\\ - localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) - logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) - - always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) begin - L[1+MAX_PIPELINE_STAGES] <= last; - L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; - end - end - assign vld = L[0]; - -//-------------------- Shift register for ZERO flag --------------------\\ - logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) - - if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; - end - end - end; - -//-------------------- Buffer for input activations --------------------\\ - localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= - // synthesis translate_off - zero ? '1 : - // synthesis translate_on - a[SIMD*k + 3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[i][EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; - end - end - for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genExternalPregWeight - else begin : genInpDSPWeight - for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genInpDSPWeight - end : genWeightSIMD - end : genWeightPE - -//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar i=0; i0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; - localparam bit FIRST = j == 0; - localparam bit LAST = j == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[i] = pp[ACCU_WIDTH-1:0]; - end - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if(BEHAVIORAL) begin : genBehav - // Stage #1: Input A/B - logic signed [33:0] Areg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Areg <= '{ default : 0}; - else if (en) begin - Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; - if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; - end - end - logic signed [23:0] Breg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Breg <= '{ default : 0}; - else if (en) begin - Breg[0] <= b_in_i[i][j]; - if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; - end - end - - // Stage #2: Multiply-Accumulate - logic signed [57:0] Mreg; - logic InmodeZero = 0; - always_ff @(posedge clk) begin - if (rst) InmodeZero <= 0; - else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); - end - always_ff @(posedge clk) begin - if (rst) Mreg <= 0; - else if (en) begin - automatic logic signed [57:0] m = 0; - for (int k = 0; k < 3; k++) begin - m = m + (InmodeZero ? 
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); - end - Mreg <= m; - end - end - - // Stage #3: Accumulate - logic signed [57:0] Preg; - logic Opmode = 0; - if (FIRST && !LAST) begin : genFirst - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg; - end - end - else assign Preg = Mreg; - end - else if (FIRST && LAST) begin : genSingle - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; - end - end - else if (!FIRST && LAST) begin : genLast - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; - end - end - else begin : genMid - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[i][j-1]; - end - end - else assign Preg = Mreg + pcout[i][j-1]; - end - assign pp = Preg; - assign pcout[i][j] = Preg; - end : genBehav -`ifndef VERILATOR - else begin: genDSP - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[i][j]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data - .B(b_in_i[i][j]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - end : genDSPChain - end : genDSPPE - -endmodule : mvu_vvu_8sx9 diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv deleted file mode 100644 index 5f215927d8..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ /dev/null @@ -1,179 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", 
WEIGHT_WIDTH); - $finish; - end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; - end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU --------------------\\ - uwire ovld; - uwire [PE-1:0][57:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( - .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), - .vld(ovld), .p(odat) - ); - -//-------------------- Output register slice --------------------\\ - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 
0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [57:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule \ No newline at end of file diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv deleted file mode 100644 index 70ffa096ef..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ /dev/null @@ -1,208 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. - *****************************************************************************/ - -module mvu_8sx9_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire 
ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v deleted file mode 100644 index e15f77fbae..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ /dev/null @@ -1,93 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - 
input s_axis_weights_tvalid, - output s_axis_weights_tready, - - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input s_axis_input_tvalid, - output s_axis_input_tready, - - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output m_axis_output_tvalid, - input m_axis_output_tready -); - -mvu_8sx9_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv deleted file mode 100644 index adf6a8f9c2..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ /dev/null @@ -1,165 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU core compute kernel. 
- *****************************************************************************/ - -module mvu_8sx9_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv deleted file mode 100644 index 07ad32e6c8..0000000000 --- a/finn-rtllib/mvu/mvu_axi.sv +++ /dev/null @@ -1,248 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. - * @details - * The following compute cores are supported: - * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, - * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, - * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, - * 'unconstrained' LUT-based MVU and VVU. - * Folding hints: - * - PE scaling should divide MH. - * - SIMD scaling should divide MW. - * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to - * impact critical paths more than PE scaling. PE scaling implies a - * bigger fanout on the input activations. - * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated - *****************************************************************************/ - -module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado - parameter COMPUTE_CORE, - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - bit FORCE_BEHAVIORAL = 0, - bit M_REG_LUT = 1, - - // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; - end - if (ACTIVATION_WIDTH > 8) begin - if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); - $finish; - end - end - if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - if (!IS_MVU) begin - if 
(COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); - $finish; - end - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - uwire mvauin_t amvau_i; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; - for (genvar i=0; i 1) ? 
- amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput - - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - $finish; - end - endcase - -//-------------------- Output register slice --------------------\\ - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - 
if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v deleted file mode 100644 index 239c5bbacd..0000000000 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ /dev/null @@ -1,92 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", - parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input 
[WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, - input weights_V_TVALID, - output weights_V_TREADY, - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, - input in0_V_TVALID, - output in0_V_TREADY, - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, - output out_V_TVALID, - input out_V_TREADY -); - -mvu_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(weights_V_TDATA), - .s_axis_weights_tvalid(weights_V_TVALID), - .s_axis_weights_tready(weights_V_TREADY), - .s_axis_input_tdata(in0_V_TDATA), - .s_axis_input_tvalid(in0_V_TVALID), - .s_axis_input_tready(in0_V_TREADY), - .m_axis_output_tdata(out_V_TDATA), - .m_axis_output_tvalid(out_V_TVALID), - .m_axis_output_tready(out_V_TREADY) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv deleted file mode 100644 index b89b58f55b..0000000000 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ /dev/null @@ -1,215 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 50; - localparam int unsigned MH = 8; - localparam int unsigned SIMD = 10; - localparam int unsigned PE = 2; - localparam int unsigned SEGMENTLEN = 2; - localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; - localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 8; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X 
AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule : mvu_axi_tb From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:12:47 +0100 Subject: [PATCH 072/123] [mvu vvu 8sx9]: renamed for consistency --- finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv similarity index 99% rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 2aa9d71b6c..6ae117e3ab 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -31,7 +31,7 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
*****************************************************************************/ -module mvu_vvu_8sx9 #( +module mvu_vvu_8sx9_dsp58 #( bit IS_MVU, int unsigned PE, int unsigned SIMD, @@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule : mvu_vvu_8sx9 +endmodule : mvu_vvu_8sx9_dsp58 From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:13:25 +0100 Subject: [PATCH 073/123] [mvu vvu axi]: changes for renamed module --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ff677fc244..416480da79 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -174,7 +174,7 @@ module mvu_vvu_axi #( case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:44:17 +0100 Subject: [PATCH 074/123] [mvu vvu wrapper]: convert localparam to param --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 270fe7351f..9c65dbc06e 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - 
localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:45:48 +0100 Subject: [PATCH 075/123] [mvau-rtl custom-op]: bugfix to instantiate memstreamer, modified renamed files and axi wrapper template fill-out --- .../matrixvectoractivation_rtl.py | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 9f8130806b..c7fb855884 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -61,8 +61,7 @@ class MatrixVectorActivation_rtl(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" + """Class that corresponds to finn-rtl Matrix Vector Unit.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -73,8 +72,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), - "ActVal": ("i", False, 0), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -165,7 +163,6 @@ def verify_node(self): # verify that all necessary attributes exist # TODO collect automatically 
from get_nodeattr_types try: - self.get_nodeattr("code_gen_dir_cppsim") self.get_nodeattr("executable_path") self.get_nodeattr("resType") self.get_nodeattr("MW") @@ -199,7 +196,6 @@ def verify_node(self): return info_messages - # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -213,7 +209,6 @@ def uram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -221,7 +216,6 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier - # TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -243,7 +237,6 @@ def bram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -262,7 +255,6 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -275,7 +267,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity - # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -290,7 +281,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
+# TODO: fix lut estimations def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -333,9 +324,13 @@ def lut_estimation(self): return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) - # TODO: FIX: worst case estimates since segmentlen is not known at this point? +# TODO: fix DSP estimations --> depends on fpga_part def dsp_estimation(self): # multiplication + # mvu_8sx9 (DSP58): ceil(SIMD/3) + # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) + # mvu_8sx8u (DSP48): ceil(PE/2) + # mvu_lut: 0 P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") Q = self.get_nodeattr("SIMD") @@ -349,18 +344,24 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) - # TODO: FIX: worst case estimates since segmentlen is not known at this point +# TODO: fix exp_cycles estimations --> depends on fpga_part and clk def get_exp_cycles(self): + # mvu_8sx9 (DSP58): + # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE + # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): + # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) + # + MW/SIMD * MH/PE + # mvu_lut: + # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") num_inp_vec = self.get_nodeattr("numInputVectors") mh = self.get_nodeattr("MH") mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles - # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. - # Most probably <10) + mmv = 1 exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -711,7 +712,7 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( + has to be set to "rtlsim" """.format( mode ) ) @@ -795,11 +796,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -813,7 +815,7 @@ def code_generation_ipi(self): ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "amd.com:FINN:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -890,11 +892,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): - # Insert pipeline registers in the DSP chain to meet target clock frequency - return 4 # default to 4 for now + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # 0.741 ns seems the worst-case delay through first DSP + # 0.605 ns seems to be (on average) delay for all subsequent DSPs + dsp_chain_len = np.floor((clk - 0.741) / 0.605) + return 
max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the - # supported RTL module - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) - if act_width == 4 and weight_width == 4: - return "mvu_4sx4u" + # supported RTL compute core + if self.get_nodeattr("resType") == "lut": + return "mvu_vvu_lut" else: - if is_versal: - return "mvu_8sx9_dsp58" + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: + return "mvu_4sx4u" else: - return "mvu_8sx8u_dsp48" + if is_versal: + return "mvu_vvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation @@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] @@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] + 
code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:47:50 +0100 Subject: [PATCH 076/123] [specialize to rtl]: fix to changed attribute name and added support for converting HLS-based VVU custom-ops to RTL-based custom-ops --- .../fpgadataflow/specialize_to_rtl_layers.py | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 47ed5ce863..5061282695 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np from qonnx.transformation.base import Transformation from qonnx.custom_op.registry import getCustomOp from qonnx.core.datatype import DataType @@ -60,7 +61,7 @@ def apply(self, model): for n in graph.node: node_ind += 1 if n.op_type == "MatrixVectorActivation": - preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" supported_in_rtl = self._is_rtl_variant_compatible(n) if (preferred_in_rtl and supported_in_rtl): mvau_input = n.input[0] @@ -76,6 +77,7 @@ def apply(self, model): pe = getCustomOp(n).get_nodeattr("PE") mem_mode = getCustomOp(n).get_nodeattr("mem_mode") ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( @@ -93,6 +95,7 @@ def apply(self, model): 
outputDataType=outputDataType, numInputVectors=numInputVectors, mem_mode=mem_mode, + resType=resType, name=n.name + "_rtl", ram_style=ram_style, runtime_writeable_weights=runtime_writeable_weights @@ -108,4 +111,81 @@ def apply(self, model): model = model.transform(InferDataTypes()) model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) + +class InferRTLVectorVectorActivation(Transformation): + """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers is supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "VectorVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + vvau_input = n.input[0] + vvau_weight = n.input[1] + vvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = getCustomOp(n).get_nodeattr("outputDataType") + pe = 
getCustomOp(n).get_nodeattr("PE") + simd = getCustomOp(n).get_nodeattr("SIMD") + dim = getCustomOp(n).get_nodeattr("Dim") + channels = getCustomOp(n).get_nodeattr("Channels") + kernel = getCustomOp(n).get_nodeattr("Kernel") + resType = getCustomOp(n).get_nodeattr("resType") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") + + new_node = helper.make_node( + "VectorVectorActivation_rtl", + [vvau_input, vvau_weight], + [vvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name=n.name + "_rtl", + PE=pe, + SIMD=simd, + Dim=dim, + Channels=channels, + Kernel=kernel, + resType=resType, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + mem_mode=mem_mode, + runtime_writeable_weights=runtime_writeable_weights, + ram_style=ram_style + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) \ No newline at end of file From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 19 Sep 2023 15:27:28 +0100 Subject: [PATCH 077/123] Adding core for DSP48 backport. 
--- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 416c12c1cc..07c44cf89a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #( int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, - bit FORCE_BEHAVIORAL = 0, + + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0, localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH )( @@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) // Ouput output logic vld, @@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx; From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 
14:48:34 +0100 Subject: [PATCH 078/123] [mvu rtl core]: added support for signed activations for DSP48-based MVUs --- finn-rtllib/mvu/mvu_4sx4u.sv | 3 ++- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 88985312c9..706347d700 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -2,6 +2,7 @@ module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, bit FORCE_BEHAVIORAL = 0 )( // Global Control @@ -57,7 +58,7 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 416480da79..da7e00cc55 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -182,14 +182,14 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001 
From: mmrahorovic Date: Mon, 25 Sep 2023 16:17:38 +0100 Subject: [PATCH 079/123] [rtl mvu custom-op]: add upper bound to SEGMENTLEN equal to number of DSP58s chained together --- src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c7fb855884..d0a638475a 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - dsp_chain_len = np.floor((clk - 0.741) / 0.605) + critical_path_dsps = np.floor((clk - 0.741) / 0.605) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len return max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:29:39 +0100 Subject: [PATCH 080/123] [mvu_vvu dsp58]: change weight input to 2D instead of 3D array --- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 6ae117e3ab..53cf71fd5f 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD ) ( // Global Control @@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput @@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + PAD_BITS_WEIGHT == 0 ? 
w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:30:55 +0100 Subject: [PATCH 081/123] [mvu_vvu axi]: re-wire weights appropriately for VVU DSP58 --- finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index da7e00cc55..f0f75c633a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -45,7 +45,7 @@ *****************************************************************************/ module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado + bit IS_MVU, parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, @@ -64,8 +64,8 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned NF = IS_MVU ? 
MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if (MW % SIMD != 0) begin + if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0) begin + if (MH % PE != 0 && IS_MVU) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -137,7 +137,7 @@ module mvu_vvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -154,9 +154,11 @@ module mvu_vvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; + uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; + assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -164,11 +166,14 @@ module mvu_vvu_axi #( // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? 
amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? + s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] + : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -178,7 +183,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:31:35 +0100 Subject: [PATCH 082/123] [mvu_vvu axi wrapper]: fix to IS_MVU parameter --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 9c65dbc06e..01deb23840 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -32,7 +32,7 @@ *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( - parameter IS_MVU = "$IS_MVU$", + parameter IS_MVU = $IS_MVU$, parameter COMPUTE_CORE = "$COMPUTE_CORE$", parameter MW = $MW$, parameter MH = $MH$, From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:32:47 +0100 Subject: [PATCH 083/123] [mvu_vvu tb]: WIP -- changes to self-checker and shape of input data --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++----------- 1 file changed, 49 
insertions(+), 30 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv index 82c2e8e7b0..b46fc588c9 100644 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -35,23 +35,23 @@ module mvu_vvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 1; + localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; - localparam int unsigned MW = 1500; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 2.0; + localparam int unsigned MW = 36; + localparam int unsigned MH = 1; + localparam int unsigned SIMD = 3; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 4; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) - localparam bit SIGNED_ACTIVATIONS = 0; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 6; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = IS_MVU ? MH/PE : 1; + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; @@ -72,7 +72,7 @@ module mvu_vvu_axi_tb(); // Generate activations typedef logic [(IS_MVU ? 
1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + typedef activation_t activation_vector_t[SF]; function activation_vector_t init_ACTIVATIONS; automatic activation_vector_t res; @@ -93,14 +93,12 @@ module mvu_vvu_axi_tb(); activations.dat = 'X; @(posedge clk iff ap_rst_n); - for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); end activations.vld <= 0; @@ -143,7 +141,9 @@ module mvu_vvu_axi_tb(); end // Function to compute golden output - // a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; typedef output_t output_vector_t [NF]; @@ -156,14 +156,33 @@ module mvu_vvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); - else - res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + // for (int j = 0; j 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? 
$signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end end end return res; From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:20:07 +0000 Subject: [PATCH 084/123] [mvu vvu axi]: minor bugfixes to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index f0f75c633a..ddedec1e8a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -64,7 +64,7 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = IS_MVU ? MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin + if (MW % SIMD != 0) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0 && IS_MVU) begin + if (MH % PE != 0) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -152,13 +152,10 @@ module mvu_vvu_axi #( //-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; - uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; - assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -169,11 +166,8 @@ module mvu_vvu_axi #( localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
- s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] - : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -183,7 +177,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:35:45 +0000 Subject: [PATCH 085/123] [mvu vvu axi]: minor fix -- define mvauin_weight_t --- finn-rtllib/mvu/mvu_vvu_axi.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ddedec1e8a..8eb92a93e6 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -153,6 +153,7 @@ module mvu_vvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; uwire mvauin_t amvau_i; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 29 Nov 2023 14:02:33 +0000 Subject: [PATCH 086/123] [rtl mvu op]: minor fix to chain length estimation and enabled behavioral mode for rtl sim --- .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index d0a638475a..da560d73fd 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - critical_path_dsps = np.floor((clk - 0.741) / 0.605) + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return max(1, dsp_chain_len) + return dsp_chain_len def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 13 Dec 2023 09:36:25 +0000 Subject: [PATCH 087/123] [rtlsim]: use pyverilator util functions --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..01b94c20ca 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import 
_read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Dec 2023 17:12:42 +0000 Subject: [PATCH 088/123] [mvu vvu axi]: sign extend output tdata (byte-aligned) --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 8eb92a93e6..699662bd72 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -243,6 +243,6 @@ module mvu_vvu_axi #( end assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; endmodule : mvu_vvu_axi From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:55:56 +0000 Subject: [PATCH 089/123] [mvu core]: dsp48 convert unpacked array to packed array to work around limitation on max array indices in Verilator --- finn-rtllib/mvu/mvu_4sx4u.sv | 4 ++-- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 706347d700..7a2af35742 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ 
b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -309,7 +309,7 @@ module mvu_4sx4u #( // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node @@ -333,7 +333,7 @@ module mvu_4sx4u #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 07c44cf89a..1e6855f779 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:57:19 +0000 Subject: [PATCH 090/123] [mvu axi]: update list of deduced parameters --- finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 
699662bd72..dd357c94bb 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -60,13 +60,14 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = IS_MVU ? MH/PE : 1, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned SF = MW / SIMD, + localparam int unsigned NF = IS_MVU ? 
MH / PE : 1 ) ( // Global Control From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:58:02 +0000 Subject: [PATCH 091/123] [mvu custom-op]: remove lut-based implementation and update compute core selection --- .../matrixvectoractivation_rtl.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index da560d73fd..fcab06658c 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -191,7 +191,12 @@ def verify_node(self): if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled or external weights." + "RTL-based MVU only supports decoupled or external weights." + ) + + if self.get_nodeattr("resType") == "lut": + info_message.append( + "RTL-based MVU only supports DSP-based implementation" ) return info_messages @@ -635,7 +640,6 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node - # TODO ensure codegen dir exists if mode == "cppsim": raise Exception( "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" @@ -801,7 +805,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -897,7 +900,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period): def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 
chain to meet target clock frequency - # 0.741 ns seems the worst-case delay through first DSP - # 0.605 ns seems to be (on average) delay for all subsequent DSPs + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) @@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk): def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the # supported RTL compute core - if self.get_nodeattr("resType") == "lut": - return "mvu_vvu_lut" + + assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + + if is_versal: + return "mvu_vvu_8sx9_dsp58" else: - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if is_versal: - return "mvu_vvu_8sx9_dsp58" - else: - return "mvu_8sx8u_dsp48" + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:59:30 +0000 Subject: [PATCH 092/123] [mvu axi]: remove LUT-based compute core --- finn-rtllib/mvu/mvu_vvu_axi.sv | 11 +--- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 
--------------------------------- 2 files changed, 2 insertions(+), 113 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index dd357c94bb..a3b051c9a1 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -120,8 +120,8 @@ module mvu_vvu_axi #( end end if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin + $error("VVU only supported on DSP58"); $finish; end end @@ -195,13 +195,6 @@ module mvu_vvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_vvu_lut": - mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); $finish; diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:02:38 +0000 Subject: [PATCH 093/123] [hls custom-op]: enable reset in sim --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 01b94c20ca..bc59c69192 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:21:00 +0000 Subject: [PATCH 094/123] [test mvu rtl]: updated test flow 
(DSP58 only) --- .../test_fpgadataflow_mvau_rtl.py | 167 +++++++++--------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 3db7a718f5..1e9de44fb2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -27,141 +27,148 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -import os import numpy as np +import os +import pickle from onnx import TensorProto, helper -from qonnx.util.basic import ( - qonnx_make_model, - gen_finn_dt_tensor -) -from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.datatype import DataType -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + + from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from qonnx.transformation.general import ApplyConfig -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl -#import qonnx.core.data_layout as DataLayout +from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode build_dir = os.environ["FINN_BUILD_DIR"] -def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): - (ofm_h, ofm_w) = ofm_shape - ofm = helper.make_tensor_value_info( - "ofm", - TensorProto.FLOAT, - (1, ofm_h, ofm_w, mh) - ) - - matmul_node = helper.make_node( - "MatMul", - ["ifm", "weights"], - ["ofm"] - ) - graph = helper.make_graph( - nodes=[matmul_node], - name="matmul_graph", - inputs=[ifm], - outputs=[ofm] - ) + +def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("ifm", idt) model.set_tensor_datatype("weights", wdt) - model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype model.set_initializer("weights", W) - # model.set_tensor_layout("ifm", DataLayout.NHWC) return model + def prepare_inputs(input_tensor): - return {"inp": input_tensor} + return {"global_in": input_tensor} + -@pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [32]) -@pytest.mark.parametrize("pe", [1, 4, 16]) -#@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [1, 4, 32]) +# @pytest.mark.parametrize("mh", [36]) +# @pytest.mark.parametrize("mw", [256]) +@pytest.mark.parametrize("mh", [9]) +@pytest.mark.parametrize("mw", [36]) +# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) +# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) +@pytest.mark.parametrize("pe", [1, 3, 9]) +@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) 
@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) -#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) -@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): +def test_fpgadataflow_mvau_rtl( + mh, mw, pe, simd, idt, wdt, part, clk_ns +): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") + # Create test input vector (produced by SWG) ofm_shape = (5, 5) ofm_h, ofm_w = ofm_shape - ifm = helper.make_tensor_value_info( - "ifm", - TensorProto.FLOAT, - [1, ofm_h, ofm_w, mw] - ) - weights = helper.make_tensor_value_info( - "weights", - TensorProto.FLOAT, - [mw, mh] - ) + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) - model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) - model.save(build_dir+"/matmul.onnx") + model.save(build_dir + "/matmul.onnx") # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + A = 
gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) input_dict = prepare_inputs(A) - ## Execute ONNX model - output_matmul = oxe.execute_onnx(model, input_dict) + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) # Create MVAU (HLS) model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = model.transform(GiveUniqueNodeNames()) - + # Apply folding (i.e. specify to use DSPs) folding_config = { "Defaults": {}, "MatrixVectorActivation_0": { - "PE" : pe, - "SIMD" : simd, - "mem_mode" : "decoupled", - "ram_style" : "auto", - "resType" : "dsp", - "impl" : "rtl" - } + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_backend" : "rtl" + }, } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir+"/mvau_hls.onnx") - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP(part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + model.save(build_dir + "/mvau_hls.onnx") # Apply convert-to-rtl step model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir+"/mvau_rtl.onnx") + model.save(build_dir + "/mvau_rtl.onnx") + # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") + model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) - output_mvau_rtl = 
oxe.execute_onnx(model, input_dict)["ofm"] + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) + + model.save(build_dir + "/mvau_rtl_sim.onnx") + assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" + + model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(part, clk_ns)) - model.save(build_dir+"/mvau_rtl_sim.onnx") + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") + model.save(build_dir + "/stitched_ip.onnx") + output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] - assert (output_mvau_hls == output_mvau_rtl).all() - assert (output_mvau_hls.size > 0) + assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
\ No newline at end of file From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 14:43:46 +0000 Subject: [PATCH 095/123] [mvu vvu axi]: reworked flow control and backpressure handling by tpreusser --- finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index a3b051c9a1..0168f20563 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -62,12 +62,12 @@ module mvu_vvu_axi #( // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = IS_MVU ? 
MH / PE : 1 + localparam int unsigned NF = MH / PE ) ( // Global Control @@ -119,81 +119,73 @@ module mvu_vvu_axi #( $finish; end end - if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin - $error("VVU only supported on DSP58"); - $finish; - end - end end uwire clk = ap_clk; uwire rst = !ap_rst_n; - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay ( .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); -//-------------------- Input control --------------------\\ + //- Unflatten inputs into structured matrices --------------------------- + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + uwire mvu_a_t mvu_a = amvau; + + //- Flow Control Bracket around Compute Core ---------------------------- uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - uwire mvauin_t amvau_i; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // 
Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = PE*SIMD; - for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput + //- Instantiate compute core ---------------------------- + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + uwire dsp_clk = ap_clk; + uwire dsp_en = en; + uwire dsp_last = alast && avld; + uwire dsp_zero = !istb; + uwire mvu_w_t dsp_w = mvu_w; + uwire mvu_a_t dsp_a = mvu_a; + uwire ovld = dsp_vld; + uwire dsp_p_t odat = dsp_p; case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); 
"mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); @@ -202,41 +194,41 @@ module mvu_vvu_axi #( endcase //-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. struct packed { - logic vld; + logic rdy; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; - assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? 
odat : A.dat + }; + end end end - assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + endmodule : mvu_vvu_axi From ee9f027592e0f28deeab5cbe8d008f3be6076c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 31 Jan 2024 09:59:17 +0000 Subject: [PATCH 096/123] Adding DSP48E1 support for 8-bit compute. Todo: finer core differentiation to select DSP48E2 explicitly again. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 165 ++++++++++++++++++++++++----- 1 file changed, 139 insertions(+), 26 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 1e6855f779..f3cde9dea9 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -5,10 +5,9 @@ module mvu_8sx8u_dsp48 #( int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned VERSION = 1, bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0, - - localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH + bit FORCE_BEHAVIORAL = 0 )( // Global Control input logic clk, @@ -49,6 +48,7 @@ module mvu_8sx8u_dsp48 #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; @@ -63,8 +63,8 @@ module mvu_8sx8u_dsp48 #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [33:0] aa; + uwire [17:0] bb = { 
{(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; logic [26:0] dd; logic [ 1:0] xx; if(1) begin : blkVectorize @@ -99,14 +99,14 @@ module mvu_8sx8u_dsp48 #( end end : blkVectorize - uwire [57:0] pp; + uwire [47:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [23:0] B1 = 0; + logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -119,7 +119,7 @@ module mvu_8sx8u_dsp48 #( end // Stage #2: Multiply - logic signed [50:0] M2 = 0; + logic signed [45:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -130,7 +130,7 @@ module mvu_8sx8u_dsp48 #( end // Stage #3: Accumulate - logic signed [57:0] P3 = 0; + logic signed [47:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 
0 : P3); @@ -140,7 +140,115 @@ module mvu_8sx8u_dsp48 #( end : genBehav `ifndef VERILATOR else begin : genDSP - DSP48E2 #( + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of 
pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit 
input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -163,21 +271,21 @@ module mvu_8sx8u_dsp48 #( .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - 
.IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP // Register Control Attributes: Pipeline Register Configuration .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) @@ -225,7 +333,7 @@ module mvu_8sx8u_dsp48 #( .ALUMODE(4'h0), // 
4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports .A(aa), // 34-bit input: A data @@ -269,6 +377,11 @@ module mvu_8sx8u_dsp48 #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase end : genDSP `endif From 3ab82966e1af64aa6ddb75f88561c5e6c86196b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 31 Jan 2024 10:15:32 +0000 Subject: [PATCH 097/123] Adding DSP48E1 support for 4-bit compute. Todo: finer core differentiation to select DSP48E2 explicitly again. --- finn-rtllib/mvu/mvu_4sx4u.sv | 169 +++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 7a2af35742..b49315637f 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -2,8 +2,10 @@ module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0 + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 )( // Global Control input logic clk, @@ -14,7 +16,7 @@ module mvu_4sx4u #( input logic last, input logic zero, // ignore current inputs and force this partial product to zero input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights - input logic [SIMD-1:0][3:0] a, // unsigned activations + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) // Ouput output logic vld, @@ -58,8 +60,8 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && 
a[s][3]}}, a[s] }; - logic [33:0] aa; + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; if(1) begin : blkVectorize @@ -94,14 +96,14 @@ module mvu_4sx4u #( end end : blkVectorize - uwire [57:0] pp; + uwire [47:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [23:0] B1 = 0; + logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -114,7 +116,7 @@ module mvu_4sx4u #( end // Stage #2: Multiply - logic signed [50:0] M2 = 0; + logic signed [45:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -125,7 +127,7 @@ module mvu_4sx4u #( end // Stage #3: Accumulate - logic signed [57:0] P3 = 0; + logic signed [47:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 
0 : P3); @@ -135,7 +137,115 @@ module mvu_4sx4u #( end : genBehav `ifndef VERILATOR else begin : genDSP - DSP48E2 #( + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline 
stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock 
enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -158,21 +268,21 @@ module mvu_4sx4u #( .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED('0), // 
Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP // Register Control Attributes: Pipeline Register Configuration .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) @@ -220,7 +330,7 @@ module mvu_4sx4u #( .ALUMODE(4'h0), // 4-bit input: ALU control 
.CARRYINSEL('0), // 3-bit input: Carry select .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports .A(aa), // 34-bit input: A data @@ -264,6 +374,11 @@ module mvu_4sx4u #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase end : genDSP `endif From 23c3f82a87a405d996ad6e3b096ca9352314adf1 Mon Sep 17 00:00:00 2001 From: johnnoel Date: Wed, 31 Jan 2024 10:36:52 +0000 Subject: [PATCH 098/123] [Tests] Temporarily re-enable SWG exception for bnn_w2_a2_cnv_Pynq-Z1 test --- tests/end2end/test_end2end_bnn_pynq.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index b296dad827..9fb41ec78e 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -653,7 +653,13 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] - model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) + if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1": + # Enabling swg_exception for this single test case. Disabling the exception results in a design + # that exceeds the resources of the Pynq-Z1 board. In future this should be revisited and handled + # correctly as the swg_exception is poorly justified. 
+ model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True)) + else: + model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") assert len(fifo_layers) > 0 model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) From 562d153b96c96ac28968d01a9f09b2be9471ea17 Mon Sep 17 00:00:00 2001 From: johnnoel Date: Wed, 31 Jan 2024 13:37:50 +0000 Subject: [PATCH 099/123] [Tests] Fix fpgadataflow split large fifos test --- tests/fpgadataflow/test_split_large_fifos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py index 3061696a68..653e1e7896 100644 --- a/tests/fpgadataflow/test_split_large_fifos.py +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -54,7 +54,7 @@ def fetch_test_model(topology, wbits=2, abits=2): def get_folding_cfg(depth=65536): cfg = dict() cfg["Defaults"] = dict() - for i in range(3): + for i in range(4): key = "StreamingFIFO_" + str(i) cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"} return cfg From a884e11ff52023e68a0f798c47bf777bacb873df Mon Sep 17 00:00:00 2001 From: johnnoel Date: Wed, 31 Jan 2024 13:48:05 +0000 Subject: [PATCH 100/123] Fix linting --- tests/end2end/test_end2end_bnn_pynq.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 9fb41ec78e..db065fec42 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -654,10 +654,12 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1": - # Enabling swg_exception for this single 
test case. Disabling the exception results in a design - # that exceeds the resources of the Pynq-Z1 board. In future this should be revisited and handled - # correctly as the swg_exception is poorly justified. - model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True)) + # Enabling swg_exception for this single test case. Disabling the exception results in + # a design that exceeds the resources of the Pynq-Z1 board. In future this should be + # revisited and handled correctly as the swg_exception is poorly justified. + model = model.transform( + InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True) + ) else: model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") From bcd72ad90f066ffab173dd0c132e553a1f4b2cd6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:20:07 +0000 Subject: [PATCH 101/123] [mvu vvu axi]: minor bugfixes to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 55 +++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 0168f20563..014481b29a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -60,14 +60,13 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = MH / PE + localparam int unsigned 
WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = IS_MVU ? MH/PE : 1, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -151,28 +150,36 @@ module mvu_vvu_axi #( assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; - //- Instantiate compute core ---------------------------- - typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; - uwire dsp_vld; - uwire dsp_p_t dsp_p; - - uwire dsp_clk = ap_clk; - uwire dsp_en = en; - uwire dsp_last = alast && avld; - uwire dsp_zero = !istb; - uwire mvu_w_t dsp_w = mvu_w; - uwire mvu_a_t dsp_a = mvu_a; - uwire ovld = dsp_vld; - uwire dsp_p_t odat = dsp_p; +//-------------------- Core MVU/VVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = PE*SIMD; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), + .vld(ovld), .p(odat) ); "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( From b1167334cf206f8cc550018594e989e9798768ce Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:26:30 +0000 Subject: [PATCH 102/123] [mvu tb]: created separate vvu testbench and renamed mvu_vvu_axi tb --- .../tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} | 16 +- finn-rtllib/mvu/tb/vvu_axi_tb.sv | 227 ++++++++++++++++++ 2 files changed, 235 insertions(+), 8 deletions(-) rename finn-rtllib/mvu/tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} (96%) create mode 100644 finn-rtllib/mvu/tb/vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv similarity index 96% rename from finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv rename to finn-rtllib/mvu/tb/mvu_axi_tb.sv index b46fc588c9..8614e9f811 100644 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -31,24 +31,24 @@ * @brief Testbench for MVU AXI-lite interface wrapper. 
*****************************************************************************/ -module mvu_vvu_axi_tb(); +module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 36; - localparam int unsigned MH = 1; - localparam int unsigned SIMD = 3; + localparam int unsigned MH = 4; + localparam int unsigned SIMD = 36; localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1.0; + localparam int unsigned SEGMENTLEN = 2.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 6; + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; + localparam bit SIGNED_ACTIVATIONS = 0; // Simulation constants localparam int unsigned NF = IS_MVU ? MH/PE : 1; localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); @@ -238,4 +238,4 @@ module mvu_vvu_axi_tb(); .m_axis_output_tready(outputs.rdy) ); -endmodule : mvu_vvu_axi_tb +endmodule : mvu_axi_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..fbb45845e1 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. 
+ *****************************************************************************/ + +module vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 0; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 25; // Kernel*Kernel + localparam int unsigned MH = 4; // Channels + localparam int unsigned SIMD = 25; // MW%SIMD == 0 + localparam int unsigned PE = 2; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 3.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[NF*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } 
activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden 
outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb From e1f8db14faf969c422b6f362c0b9329a8be6269e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:35:45 +0000 Subject: [PATCH 103/123] [mvu vvu axi]: minor fix -- define mvauin_weight_t --- finn-rtllib/mvu/mvu_vvu_axi.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 014481b29a..20be83910a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -154,6 +154,7 @@ module mvu_vvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; uwire mvauin_t amvau_i; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; if (IS_MVU) begin : 
genMVUInput assign amvau_i = amvau; From 88da6965ee560b53f672229012eccca2c343111a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:43:58 +0000 Subject: [PATCH 104/123] [folding]: first attempt to extend folding transformation to parallelize multi-packed DSPs in MVU/VVU more efficiently --- .../fpgadataflow/set_folding.py | 75 +++++++++++++++---- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index eca1053f8f..871919f3f2 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -31,6 +31,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.datatype import DataType from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles @@ -80,11 +81,12 @@ class SetFolding(Transformation): unfolded before SIMD is increased """ - def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True): + def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True, fpga_part=None): super().__init__() self.target_cycles_per_frame = target_cycles_per_frame self.mvau_wwidth_max = mvau_wwidth_max self.two_pass_relaxation = two_pass_relaxation + self.fpga_part = fpga_part def optimize_attribute_val(self, node_inst, max_val, attr_name): node_inst.set_nodeattr(attr_name, 1) @@ -95,6 +97,10 @@ def optimize_attribute_val(self, node_inst, max_val, attr_name): # finish if target met break + def _is_versal(self, fpga_part): + assert fpga_part is not None, "Please specify a target board before setting the folding configuration for a more efficient folding configuration for RTL-based MVU/VVU" + return 
fpga_part[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpga_part[0:5] == "xqrvc" + def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max value of NumChannels @@ -112,13 +118,14 @@ def apply(self, model): simd_ops = [ "DownSampler", "FMPadding_Batch", + "FMPadding_Batch_rtl", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VectorVectorActivation", "VectorVectorActivation_rtl", "Pool_Batch"] for node in graph.node: if not is_fpgadataflow_node(node): continue @@ -148,6 +155,37 @@ def apply(self, model): break # increase PE until target met or reached max_pe self.optimize_attribute_val(node_inst, max_pe, "PE") + if op_type == "MatrixVectorActivation_rtl": + max_simd = node_inst.get_nodeattr("MW") + max_pe = node_inst.get_nodeattr("MH") + node_inst.set_nodeattr("PE", 1) + node_inst.set_nodeattr("SIMD", 1) + # Depending on the board and the layer's config, either the + # SIMD or PE folding dimension would be preferred to enable efficient DSP-packing + act_width = DataType[node_inst.get_nodeattr("inputDataType")].bitwidth() + weight_width = DataType[node_inst.get_nodeattr("weightDataType")].bitwidth() + is_versal = self._is_versal(self.fpga_part) + is_dsp48 = act_width < 5 and weight_width < 5 or not(is_versal) + preferred_folding_dimension = "PE" if is_dsp48 else "SIMD" + preferred_folding_max = max_pe if is_dsp48 else max_simd + second_folding_dimension = "SIMD" if is_dsp48 else "PE" + second_folding_max = max_simd if is_dsp48 else max_pe + for fold_val in divisors(preferred_folding_max): + prev_fold_val = node_inst.get_nodeattr(preferred_folding_dimension) + node_inst.set_nodeattr(preferred_folding_dimension, fold_val) + cyc = node_inst.get_exp_cycles() + if cyc <
self.target_cycles_per_frame: + # finish if target met + break + if ( + node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr(preferred_folding_dimension) + > self.mvau_wwidth_max + ): + # revert if we've gone above width threshold + node_inst.set_nodeattr(preferred_folding_dimension, prev_fold_val) + break + # increase SIMD until target met or reached max_simd + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") @@ -156,37 +194,44 @@ def apply(self, model): self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation": - node_inst.set_nodeattr("SIMD", 1) + is_hls_vvu_or_pool = op_type in ["VectorVectorActivation", "Pool_Batch"] max_pe = node_inst.get_nodeattr("Channels") - self.optimize_attribute_val(node_inst, max_pe, "PE") - # increase SIMD for VVAU once PE is exhausted - pe = node_inst.get_nodeattr("PE") + max_simd = np.prod(node_inst.get_nodeattr("Kernel")) if op_type.startswith("VectorVectorActivation") else 0 + preferred_folding_dimension = "PE" if is_hls_vvu_or_pool else "SIMD" + preferred_folding_max = max_pe if is_hls_vvu_or_pool else max_simd + second_folding_dimension = "SIMD" if is_hls_vvu_or_pool else "PE" + second_folding_max = max_simd if is_hls_vvu_or_pool else max_pe + if op_type.startswith("VectorVectorActivation"): + node_inst.set_nodeattr(second_folding_dimension, 1) + self.optimize_attribute_val(node_inst, preferred_folding_max, preferred_folding_dimension) + # increase SIMD(/PE) for VVAU once PE(/SIMD) is exhausted + fold_val = node_inst.get_nodeattr(preferred_folding_dimension) cyc = node_inst.get_exp_cycles() if ( - op_type == "VectorVectorActivation" - and pe == max_pe + op_type.startswith("VectorVectorActivation") + and fold_val == preferred_folding_max and cyc > 
self.target_cycles_per_frame ): - max_simd = np.prod(node_inst.get_nodeattr("Kernel")) - self.optimize_attribute_val(node_inst, max_simd, "SIMD") - # also set the folding of the upsteam DW SWU + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) + # also set the folding of the upsteam DW SWU (in case of HLS-based VVU) # which must be identical to this node swu_node = model.find_producer(node.input[0]) if swu_node.op_type.startswith("ConvolutionInputGenerator"): swu_node_inst = getCustomOp(swu_node) - swu_node_inst.set_nodeattr("SIMD", pe) # enable parallel_window mode of RTL SWG if needed if swu_node.op_type == "ConvolutionInputGenerator_rtl": if ( - op_type == "VectorVectorActivation" + op_type.startswith("VectorVectorActivation") and node_inst.get_nodeattr("SIMD") > 1 ): swu_node_inst.set_nodeattr("parallel_window", 1) + swu_node_inst.set_nodeattr("SIMD", max_pe) else: swu_node_inst.set_nodeattr("parallel_window", 0) + pe = node_inst.get_nodeattr("PE") + swu_node_inst.set_nodeattr("SIMD", pe) else: - if op_type == "VectorVectorActivation": + if op_type.startswith("VectorVectorActivation"): ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_Batch": ksize = node_inst.get_nodeattr("KernelSize") From 1814ea08ccdb995107faf54000a0ecdb52c292b1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:57:19 +0000 Subject: [PATCH 105/123] [mvu axi]: update list of deduced parameters --- finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 20be83910a..f2b030342b 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -60,13 +60,14 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = IS_MVU ? MH/PE : 1, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned SF = MW / SIMD, + localparam int unsigned NF = IS_MVU ? MH / PE : 1 ) ( // Global Control From f939c3e845b75bd940f4f2b2453b416c07a28457 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 14:43:46 +0000 Subject: [PATCH 106/123] [mvu vvu axi]: reworked flow control and backpressure handling by tpreusser --- finn-rtllib/mvu/mvu_vvu_axi.sv | 45 ++++++++++++++-------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index f2b030342b..0168f20563 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -62,12 +62,12 @@ module mvu_vvu_axi #( // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = IS_MVU ? MH / PE : 1 + localparam int unsigned NF = MH / PE ) ( // Global Control @@ -151,37 +151,28 @@ module mvu_vvu_axi #( assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - uwire mvauin_t amvau_i; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = PE*SIMD; - for (genvar i=0; i 1) ? 
- amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput + //- Instantiate compute core ---------------------------- + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + uwire dsp_clk = ap_clk; + uwire dsp_en = en; + uwire dsp_last = alast && avld; + uwire dsp_zero = !istb; + uwire mvu_w_t dsp_w = mvu_w; + uwire mvu_a_t dsp_a = mvu_a; + uwire ovld = dsp_vld; + uwire dsp_p_t odat = dsp_p; case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( From ef12de1a86111cfab783640cd3a2a835de2791fe Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 6 Feb 2024 14:13:19 +0000 Subject: [PATCH 107/123] [mvu/vvu axi]: picked out modifications from another branch to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 0168f20563..3affe4bb7b 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -62,7 +62,7 @@ module mvu_vvu_axi #( // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned 
WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, @@ -125,25 +125,44 @@ module mvu_vvu_axi #( uwire rst = !ap_rst_n; //- Replay to Accommodate Neuron Fold ----------------------------------- - typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); //- Unflatten inputs into structured matrices --------------------------- - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; - + localparam int unsigned ACT_PE = IS_MVU? 1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < ACT_PE; pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; + end + end + end : genVVUInput + uwire mvu_w_t mvu_w = s_axis_weights_tdata; - uwire mvu_a_t mvu_a = amvau; + uwire mvu_a_t mvu_a = amvau_i; //- Flow Control Bracket around Compute Core ---------------------------- uwire en; From 3d49ab5a204b0428420ca171ff7aba3b89b52cb9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 6 Feb 2024 14:28:40 +0000 Subject: [PATCH 108/123] [mvu test]: cleaned up test --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 8614e9f811..62aa0919f4 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,13 +35,13 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 0; + localparam bit IS_MVU = 1; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 36; localparam int unsigned MH = 4; - localparam int unsigned SIMD = 36; - localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 2.0; + localparam int unsigned SIMD = 9; + localparam int unsigned PE = 2; + localparam int unsigned SEGMENTLEN = 1.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config @@ -156,16 +156,6 @@ module mvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; - // for (int j = 0; j 1 ? 
$signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); - // else - // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); - // end - // end // The input stream will have the channels interleaved for VVU when PE>1 // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: From 105ae6fde79d9b7eca17f23fc3f7c80b0db51f6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 07:58:41 +0100 Subject: [PATCH 109/123] Revised control interface attributes. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 93 ++++++++++++++++++++++++++ finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v new file mode 100644 index 0000000000..e15f77fbae --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -0,0 +1,93 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input s_axis_weights_tvalid, + output s_axis_weights_tready, + + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input s_axis_input_tvalid, + output s_axis_input_tready, + + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output m_axis_output_tvalid, + input m_axis_output_tready +); + +mvu_8sx9_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + 
.s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + .m_axis_output_tready(m_axis_output_tready) +); + +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 01deb23840..99178880f7 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -89,4 +89,4 @@ mvu_vvu_axi #( .m_axis_output_tready(out_V_TREADY) ); -endmodule // $MODULE_NAME_AXI_WRAPPER$ +endmodule : $MODULE_NAME_AXI_WRAPPER$ From 936ef69bb868d2702472d7c1a6c3767a11263cf4 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 15:49:19 +0100 Subject: [PATCH 110/123] [rtl mvu]: extension to allow selecting PE values that are not multiples of 4 --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index b49315637f..304637dd31 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -440,7 +440,7 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi - else if (i < 3) begin : genHiZero + else begin : genHiZero assign hi4[i] = '0; end : genHiZero From 9bf7e33408b84a7facaf0c0785eef5c5f053bfea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 29 Sep 2023 15:24:28 +0100 Subject: [PATCH 111/123] Starting on pumped DSP compute. 
--- finn-rtllib/mvu/mvu_vvu_axi.sv | 224 ++++++++++++++++++++++++++------- 1 file changed, 182 insertions(+), 42 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 3affe4bb7b..1b690195f3 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -51,27 +51,27 @@ module mvu_vvu_axi #( int unsigned MH, int unsigned PE, int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, + + bit PUMPED_COMPUTE = 0, // requires an even SIMD % 2 == 0 bit FORCE_BEHAVIORAL = 0, bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = MH / PE -) -( + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( // Global Control input logic ap_clk, + input logic ap_clk2x, // only used when PUMPED_COMPUTE input logic ap_rst_n, // Weight Stream @@ -119,23 +119,39 @@ module mvu_vvu_axi #( $finish; end end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + + //- Pumping Constraints --------- + if(PUMPED_COMPUTE) begin + if(SIMD % 2 != 0) begin + $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); + $finish; + end + end end uwire clk = ap_clk; uwire rst = !ap_rst_n; //- Replay to Accommodate Neuron Fold ----------------------------------- - typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; replay_buffer #(.LEN(SF), .REP(IS_MVU ? 
NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); //- Unflatten inputs into structured matrices --------------------------- @@ -143,6 +159,8 @@ module mvu_vvu_axi #( typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + //- Conditional Activations Layout Adjustment for VVU uwire mvu_a_t amvau_i; if (IS_MVU || (PE == 1)) begin : genMVUInput @@ -184,33 +202,155 @@ module mvu_vvu_axi #( uwire ovld = dsp_vld; uwire dsp_p_t odat = dsp_p; - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - 
.vld(dsp_vld), .p(dsp_p) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - $finish; - end - endcase + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned DSP_SIMD = SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle before active slow clock edge + logic Active = 0; + always_ff @(posedge clk2x) Active <= clk; + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. 
+ logic En = 0; + logic Last[1:0] = '{ default: 1'b0 }; + logic Zero = 1; + dsp_w_t W[1:0] = '{ default: 'x }; + dsp_a_t A[1:0] = '{ default: 'x }; + always_ff @(posedge clk2x) begin + if(rst) begin + En <= 0; + Last <= '{ default: 1'b0 }; + Zero <= 1; + W <= '{ default: 'x }; + A <= '{ default: 'x }; + end + else begin + if(Active) begin + En <= en; + if(en) begin + Last <= '{ alast && avld, 1'b0 }; + Zero <= !istb; + for(int unsigned simd = 0; simd < SIMD; simd++) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd]; + end + for(int unsigned pe = 0; pe < ACT_PE; pe++) begin + A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd]; + end + end + end + end + else if(En) begin + Last <= '{ 'x, Last[1] }; + W <= '{ 'x, W[1] }; + A <= '{ 'x, A[1] }; + end + end + end + assign dsp_en = En; + + assign dsp_last = Last[0]; + assign dsp_zero = Zero; + assign dsp_w = W[0]; + assign dsp_a = A[0]; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. 
+ logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp //-------------------- Output register slice --------------------\\ // Make `en`computation independent from external inputs. 
From 80a5510cdb06b443c9d71cc5180f1c8bc6569886 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 14:00:19 +0000 Subject: [PATCH 112/123] pulled latest changes related to double-pumping --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_vvu_axi.sv | 136 ++++++++++++------------ finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 9 +- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 18 +++- finn-rtllib/mvu/tb/mvu_dsp58_tb.sv | 142 ++++++++++++++++++++++++++ 5 files changed, 227 insertions(+), 80 deletions(-) create mode 100644 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 304637dd31..b49315637f 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -440,7 +440,7 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi - else begin : genHiZero + else if (i < 3) begin : genHiZero assign hi4[i] = '0; end : genHiZero diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 1b690195f3..d40c5e1b10 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -67,11 +67,13 @@ module mvu_vvu_axi #( localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 )( // Global Control input logic ap_clk, - input logic ap_clk2x, // only used when PUMPED_COMPUTE + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE input logic ap_rst_n, // Weight Stream @@ -126,17 +128,18 @@ module mvu_vvu_axi #( end end - //- Pumping Constraints --------- - if(PUMPED_COMPUTE) begin - if(SIMD % 2 != 0) begin - $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); - $finish; - end - end + // //- Pumping Constraints --------- + // if(PUMPED_COMPUTE) begin + // if(SIMD % 2 != 0) begin + // $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); + // $finish; + // end + // end end - uwire clk = ap_clk; - uwire rst = !ap_rst_n; + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; //- Replay to Accommodate Neuron Fold ----------------------------------- typedef logic [(IS_MVU? 
1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; @@ -178,29 +181,6 @@ module mvu_vvu_axi #( end end end : genVVUInput - - uwire mvu_w_t mvu_w = s_axis_weights_tdata; - uwire mvu_a_t mvu_a = amvau_i; - - //- Flow Control Bracket around Compute Core ---------------------------- - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - - //- Instantiate compute core ---------------------------- - typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; - uwire dsp_vld; - uwire dsp_p_t dsp_p; - - uwire dsp_clk = ap_clk; - uwire dsp_en = en; - uwire dsp_last = alast && avld; - uwire dsp_zero = !istb; - uwire mvu_w_t dsp_w = mvu_w; - uwire mvu_a_t dsp_a = mvu_a; - uwire ovld = dsp_vld; - uwire dsp_p_t odat = dsp_p; //- Flow Control Bracket around Compute Core ---------------------------- uwire en; @@ -213,7 +193,8 @@ module mvu_vvu_axi #( uwire ovld; uwire dsp_p_t odat; if(1) begin : blkDsp - localparam int unsigned DSP_SIMD = SIMD/(PUMPED_COMPUTE+1); + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? 
SIMD+1 : SIMD; + localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; @@ -243,56 +224,66 @@ module mvu_vvu_axi #( else begin : genPumpedCompute assign dsp_clk = clk2x; - // Identify second fast cycle before active slow clock edge + // Identify second fast cycle just before active slow clock edge logic Active = 0; - always_ff @(posedge clk2x) Active <= clk; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive // The input for a slow cycle is split across two fast cycles along the SIMD dimension. // - Both fast cycles are controlled by the same enable state. // - A zero cycle is duplicated across both fast cycles. // - The last flag must be restricted to the second fast cycle. - logic En = 0; - logic Last[1:0] = '{ default: 1'b0 }; + + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? 
DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + logic Zero = 1; - dsp_w_t W[1:0] = '{ default: 'x }; - dsp_a_t A[1:0] = '{ default: 'x }; + logic Last = 0; always_ff @(posedge clk2x) begin if(rst) begin - En <= 0; - Last <= '{ default: 1'b0 }; - Zero <= 1; - W <= '{ default: 'x }; - A <= '{ default: 'x }; + Zero <= 1; + Last <= 0; end - else begin - if(Active) begin - En <= en; - if(en) begin - Last <= '{ alast && avld, 1'b0 }; - Zero <= !istb; - for(int unsigned simd = 0; simd < SIMD; simd++) begin - for(int unsigned pe = 0; pe < PE; pe++) begin - W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd]; - end - for(int unsigned pe = 0; pe < ACT_PE; pe++) begin - A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd]; - end - end - end - end - else if(En) begin - Last <= '{ 'x, Last[1] }; - W <= '{ 'x, W[1] }; - A <= '{ 'x, A[1] }; - end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; end end - assign dsp_en = En; - assign dsp_last = Last[0]; + assign dsp_en = en; + assign dsp_last = Last; assign dsp_zero = Zero; - assign dsp_w = W[0]; - assign dsp_a = A[0]; + assign dsp_w = W; + assign dsp_a = A; // Since no two consecutive last cycles will ever be asserted on the input, // valid outputs will also always be spaced by, at least, one other cycle. @@ -305,7 +296,7 @@ module mvu_vvu_axi #( Vld <= 0; P <= 'x; end - else begin + else if(en) begin if(dsp_vld) P <= dsp_p; Vld <= dsp_vld || (Vld && !Active); end @@ -389,5 +380,4 @@ module mvu_vvu_axi #( // These extra bits should never be used. Why not 'x them out? 
assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; - endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 99178880f7..11949dec24 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -34,6 +34,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter IS_MVU = $IS_MVU$, parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = $PUMPED_COMPUTE$, parameter MW = $MW$, parameter MH = $MH$, parameter PE = $PE$, @@ -54,6 +55,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + input ap_clk2x, (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, @@ -72,11 +76,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #( ); mvu_vvu_axi #( - .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) inst ( .ap_clk(ap_clk), + .ap_clk2x(ap_clk2x), .ap_rst_n(ap_rst_n), .s_axis_weights_tdata(weights_V_TDATA), .s_axis_weights_tvalid(weights_V_TVALID), @@ -89,4 +94,4 @@ mvu_vvu_axi #( .m_axis_output_tready(out_V_TREADY) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 62aa0919f4..8614e9f811 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ 
b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,13 +35,13 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 1; + localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 36; localparam int unsigned MH = 4; - localparam int unsigned SIMD = 9; - localparam int unsigned PE = 2; - localparam int unsigned SEGMENTLEN = 1.0; + localparam int unsigned SIMD = 36; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 2.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config @@ -156,6 +156,16 @@ module mvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; + // for (int j = 0; j 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end // The input stream will have the channels interleaved for VVU when PE>1 // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv new file mode 100644 index 0000000000..108980c497 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -0,0 +1,142 @@ +module mvu_dsp58_tb; + + localparam int unsigned N = 1000; + + localparam int unsigned MW = 12; + localparam int unsigned MH = 4; + localparam int unsigned PE = 2; + localparam int unsigned SIMD = 6; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 8; + localparam int unsigned ACCU_WIDTH = 24; + + //- Global Control ------------------ + logic clk = 1; + logic clk2x = 1; + always #5ns clk = !clk; + always #2.5ns clk2x = !clk2x; + + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + //- DUTs ---------------------------- + + // Weight Stream + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid[2]; + uwire s_axis_weights_tready[2]; + + // Input Stream + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid 
(m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occasional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb From 
289749b0c4b0e72fef39d4f7011380571f4b6869 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 14:00:40 +0000 Subject: [PATCH 113/123] minor fix to param --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 8614e9f811..08e8679214 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,7 +35,7 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 0; + localparam bit IS_MVU = 1; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 36; localparam int unsigned MH = 4; From b51837b3a366a85f15931d7bc8a1aef0dc82494b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 7 Feb 2024 09:38:59 +0000 Subject: [PATCH 114/123] added RTL-based MVAU and VVAU custom-ops --- src/finn/custom_op/fpgadataflow/rtl/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 914c033584..28e08aa445 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -34,6 +34,8 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MatrixVectorActivation_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VectorVectorActivation_rtl custom_op = dict() @@ -43,3 +45,5 @@ custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl +custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl +custom_op["VectorVectorActivation_rtl"] = VectorVectorActivation_rtl From 
86465e0316e567b805b209e8eece1d8c87d9158d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 7 Feb 2024 09:40:11 +0000 Subject: [PATCH 115/123] [builder]: renamed specialize_to_rtl step to specialize_layers step, default standalone_thresholds set to False --- src/finn/builder/build_dataflow_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 073bc9e12b..85b7d61ce5 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -116,6 +116,7 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_layers", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", @@ -139,6 +140,7 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_layers", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", @@ -234,7 +236,7 @@ class DataflowBuildConfig: #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. - standalone_thresholds: Optional[bool] = True + standalone_thresholds: Optional[bool] = False #: (Optional) Whether optimizations that minimize the bit width of the #: weights and accumulator will be applied. 
Because this optimization relies From c8b793c081ea0f72d491c48cef8681222f91b6f0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 7 Feb 2024 09:40:45 +0000 Subject: [PATCH 116/123] [builder]: added first version of specialize_layer step --- src/finn/builder/build_dataflow_steps.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2629efef11..b74dc7adc5 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,7 +53,7 @@ from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -473,12 +473,9 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model -def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert layers implemented in HLS to an equivalent specialized RTL - implementation if possible.""" - specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] - for trn in specialize_to_rtl_transforms: - model = model.transform(trn) +def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert HW custom-ops into custom-ops suitable for FPGA implementation either with HLS or RTL backend.""" + model = model.transform(SpecializeLayers()) return model @@ -844,7 +841,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": 
step_generate_estimate_reports, - "step_specialize_to_rtl": step_specialize_to_rtl, + "step_specialize_layers": step_specialize_layers, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, "step_set_fifo_depths": step_set_fifo_depths, From 79ff91137c9b753807803d76b60aea6f169ca59f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 14:41:26 +0000 Subject: [PATCH 117/123] pulled latest changes --- .../fpgadataflow/create_stitched_ip.py | 37 ++- .../fpgadataflow/set_folding.py | 4 +- .../test_fpgadataflow_mvau_rtl.py | 123 +++++---- .../test_fpgadataflow_vvau_rtl.py | 234 ++++++++++++++++++ 4 files changed, 339 insertions(+), 59 deletions(-) create mode 100644 tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1c316e1285..f797e3d841 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -48,13 +48,12 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) - op_type = node.op_type producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if op_type.startswith("MatrixVectorActivation"): + if node.op_type == "MatrixVectorActivation": if node_inst.get_nodeattr("mem_mode") == "external": return True return False @@ -103,6 +102,7 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu # keep track of top-level interface names self.intf_names = { "clk": [], + "clk2x": [], "rst": [], "s_axis": [], "m_axis": [], @@ -110,10 +110,19 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu "axilite": [], } + def _is_double_pumped(self, node): + try: + 
pumped_compute = getCustomOp(node).get_nodeattr("pumpedCompute") + return pumped_compute==1 + except: + return False + def connect_clk_rst(self, node): inst_name = node.name node_inst = getCustomOp(node) clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] + if self._is_double_pumped(node): + clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0] reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] # make clock and reset external, if they aren't already if not self.clock_reset_are_external: @@ -128,6 +137,22 @@ def connect_clk_rst(self, node): self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] self.intf_names["rst"] = ["ap_rst_n"] + # make clk2x external, if it isn't already and connect clk and reset + elif self._is_double_pumped(node) and not self.clock2x_is_external: + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name) + ) + self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") + self.clock2x_is_external = True + self.intf_names["clk2x"] = ["ap_clk2x"] + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" + % (inst_name, reset_intf_name) + ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" + % (inst_name, clock_intf_name) + ) # otherwise connect clock and reset else: self.connect_cmds.append( @@ -138,6 +163,11 @@ def connect_clk_rst(self, node): "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) + if self._is_double_pumped(node): + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" + % (inst_name, clock2x_intf_name) + ) def connect_axi(self, node): inst_name = node.name @@ -285,7 +315,7 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") 
- if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA_hls"]: + if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware @@ -377,6 +407,7 @@ def apply(self, model): fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) + tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz)) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 5555237ca3..871919f3f2 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -118,7 +118,7 @@ def apply(self, model): simd_ops = [ "DownSampler", "FMPadding_Batch", - "FMPadding_Pixel", + "FMPadding_Batch_rtl", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", "ConvolutionInputGenerator_rtl", @@ -131,7 +131,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type.startswith("MatrixVectorActivation"): + if op_type == "MatrixVectorActivation": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 1e9de44fb2..45b33b24e8 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -27,6 +27,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import pytest +import os +import pickle import numpy as np import os @@ -42,13 +44,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths - - -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from qonnx.custom_op.registry import getCustomOp build_dir = os.environ["FINN_BUILD_DIR"] @@ -70,52 +66,58 @@ def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): return model - def prepare_inputs(input_tensor): - return {"global_in": input_tensor} - - -# @pytest.mark.parametrize("mh", [36]) -# @pytest.mark.parametrize("mw", [256]) -@pytest.mark.parametrize("mh", [9]) -@pytest.mark.parametrize("mw", [36]) -# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) -# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) -@pytest.mark.parametrize("pe", [1, 3, 9]) -@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) -@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) -# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) + return {"ifm": input_tensor} + +@pytest.mark.parametrize("mh", [4]) +# @pytest.mark.parametrize("mw", [36]) +@pytest.mark.parametrize("mw", [18]) +# @pytest.mark.parametrize("pe", [1,2,4,8]) +@pytest.mark.parametrize("pe", [2]) +# @pytest.mark.parametrize("simd", [1,3,6,9,18,36]) +@pytest.mark.parametrize("simd", [6]) +#@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("idt", [DataType["UINT8"]]) 
+#@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) +@pytest.mark.parametrize("wdt", [DataType["INT8"]]) +#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) +#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) -@pytest.mark.parametrize("clk_ns", [1.66, 4]) +@pytest.mark.parametrize("segmentlen", [1]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_mvau_rtl( - mh, mw, pe, simd, idt, wdt, part, clk_ns -): - if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: - pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") - +def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): + # Synthesis constants + clk_ns = 5 # Create test input vector (produced by SWG) ofm_shape = (5, 5) ofm_h, ofm_w = ofm_shape ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) - model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) + # np.save("weights.npy", W) + ## + # W = np.load("weights.npy") + model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model.save(build_dir + "/matmul.onnx") # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) + A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + # np.save("activations.npy", A) + ## + # A = np.load("activations.npy") input_dict = prepare_inputs(A) # Execute ONNX model output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + with open(build_dir + 
"/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) + with open(build_dir + "/onnx_output.pkl", "wb") as f: pickle.dump(output_matmul, f) @@ -127,26 +129,41 @@ def test_fpgadataflow_mvau_rtl( folding_config = { "Defaults": {}, "MatrixVectorActivation_0": { - "PE": pe, - "SIMD": simd, - "mem_mode": "decoupled", - "ram_style": "auto", - "resType": "dsp", + "PE" : pe, + "SIMD" : simd, + "mem_mode" : "decoupled", + "ram_style" : "auto", + "resType" : "dsp", "preferred_backend" : "rtl" - }, + } } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir + "/mvau_hls.onnx") + model.save(build_dir+"/mvau_hls.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_hls.vcd") + output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + # Apply convert-to-rtl step model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir + "/mvau_rtl.onnx") + for n in model.graph.node: + if n.op_type=="MatrixVectorActivation_rtl": + getCustomOp(n).set_nodeattr("pumpedCompute", 0) + model.save(build_dir+"/mvau_rtl.onnx") # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated for n in model.graph.node: - getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") - + getCustomOp(n).set_nodeattr("rtlsim_so", "") + getCustomOp(n).set_nodeattr("code_gen_dir_ipgen", "") + getCustomOp(n).set_nodeattr("ipgen_path", "") + getCustomOp(n).set_nodeattr("ip_path", "") + getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_rtl.vcd") model = model.transform(SetExecMode("rtlsim")) model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) @@ -156,19 +173,17 @@ def test_fpgadataflow_mvau_rtl( 
with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: pickle.dump(output_mvau_rtl, f) - model.save(build_dir + "/mvau_rtl_sim.onnx") - assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" + with open(build_dir + "/hls_output.pkl", "wb") as f: + pickle.dump(output_mvau_hls, f) - model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) - model = model.transform(PrepareIP(part, clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(part, clk_ns)) + with open(build_dir + "/rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) - os.environ["RTLSIM_TRACE_DEPTH"] = "3" - model.set_metadata_prop("rtlsim_so", "") - model.set_metadata_prop("exec_mode", "rtlsim") - model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") - model.save(build_dir + "/stitched_ip.onnx") - output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] + # model = model.transform(PrepareIP(part, clk_ns)) + # model = model.transform(HLSSynthIP()) + # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True)) + # model.save(build_dir+"/stitched_ip.onnx") - assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" \ No newline at end of file + #assert (output_mvau_hls == output_mvau_rtl).all() + assert (output_matmul['ofm'] == output_mvau_rtl).all() + # assert (output_mvau_hls.size > 0) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py new file mode 100644 index 0000000000..25fad308ee --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py @@ -0,0 +1,234 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +import os +import pickle +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + +# import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn = in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, 
in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)] + + modelproto = qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + + +@pytest.mark.parametrize("kernel_size", [3]) +@pytest.mark.parametrize("in_feature_dim", [5]) +@pytest.mark.parametrize("in_chn", [4]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +# @pytest.mark.parametrize("idt", [DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("simd", [1, 3, 9]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl( + kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd +): + # Create depthwise-separable convolution + conv_config = (kernel_size, in_feature_dim, in_chn) + model = make_single_dw_conv_modelwrapper(conv_config, 
idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/dw_conv.onnx") + + # Obtain golden reference output + golden_in = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(golden_in) + golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + with open(build_dir + "/onnx_dws_conv.pkl", "wb") as f: + pickle.dump(golden_out, f) + + # Convert to HLS custom-op first + model = model.transform(LowerConvsToMatMul()) + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/hls_vvau.onnx") + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "ConvolutionInputGenerator_rtl_0": {"SIMD": 4, "parallel_window": 1}, + "VectorVectorActivation_0": { + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_backend": "rtl", + }, + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir + "/hls_vvau_folded.onnx") + + # Obtain second reference from HLS-based VVAU layer + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + with open(build_dir + "/hls_vvau_folded_output.pkl", "wb") as f: + pickle.dump(conv_hls_out, f) + + # Stitched-IP RTLsim + model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir)) + model.save(build_dir + "/ip-stitched.onnx") + partition_model_path = getCustomOp( + 
model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + ).get_nodeattr("model") + partitioned_model = ModelWrapper(partition_model_path) + # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism + partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5)) + partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) + partitioned_model = partitioned_model.transform(HLSSynthIP()) + partitioned_model.save(build_dir + "/partitioned_model.onnx") + partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) + partitioned_model.save(partition_model_path) + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/hls-vvu.vcd") + # set top-level prop for stitched-ip rtlsim and launch + partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + # transpose input since we're now simulating HW layers (NCHW --> NHWC) + input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1)) + stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) + with open(build_dir + "/stitched_ip_output.pkl", "wb") as f: + pickle.dump(stitched_ip_out, f) + + # Apply convert-to-rtl step + partitioned_model = partitioned_model.transform(to_rtl.InferRTLVectorVectorActivation()) + partitioned_model = partitioned_model.transform(GiveUniqueNodeNames()) + partitioned_model = partitioned_model.transform(GiveReadableTensorNames()) + partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) + partitioned_model = partitioned_model.transform(HLSSynthIP()) + partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) + partitioned_model.save(build_dir + "/partition_rtl_vvau.onnx") + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/rtl-vvu.vcd") + # Reset rtlsim_so path to re-generate Pyverilator sim object + partitioned_model.set_metadata_prop("rtlsim_so", "") + # set top-level prop for stitched-ip rtlsim and launch + 
partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) + with open(build_dir + "/rtl_vvau_output.pkl", "wb") as f: + pickle.dump(vvu_rtl_out, f) + + golden_ret = golden_out["global_out"] + # tranpose hardware-generated outputs NHWC -> NCHW to be comparable + vvu_rtl_ret = vvu_rtl_out["global_out"].transpose(0, 3, 1, 2) + hls_ret = stitched_ip_out["global_out"].transpose(0, 3, 1, 2) + + assert ( + vvu_rtl_ret == golden_ret + ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" + assert ( + vvu_rtl_ret == hls_ret + ).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!" From 3f9e85ce68edc0835a06850d34ed7eca0a01c53c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 16:30:15 +0000 Subject: [PATCH 118/123] fixed broken merge --- src/finn/custom_op/fpgadataflow/__init__.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 70ea858b66..1f2c2740bb 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -41,12 +41,7 @@ from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation -<<<<<<< HEAD from finn.custom_op.fpgadataflow.pool import Pool -======= -from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl -from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch ->>>>>>> origin/feature/mvu_vvu_dsp_pumping from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) @@ -65,21 +60,10 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure 
custom_op["MatrixVectorActivation"] = MatrixVectorActivation -<<<<<<< HEAD -======= -custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl -custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator -custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D -custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl -custom_op["TLastMarker"] = TLastMarker -custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch -custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl ->>>>>>> origin/feature/mvu_vvu_dsp_pumping custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding"] = Thresholding custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition - custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator From 1245763291cb536f31bb91b187cba1ea99014ad9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 16:31:20 +0000 Subject: [PATCH 119/123] [mvau hls]: added lut/dsp estimation functions, function for stitching the ip and small fix to node execution --- .../hls/matrixvectoractivation_hls.py | 94 ++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 5206ee3a06..d6d122e41b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from pyverilator.util.axi_utils import toggle_clk, reset_rtlsim # ONNX 
i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,6 +55,84 @@ def get_nodeattr_types(self): my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + tmem_style = self.get_nodeattr("ram_style_thresholds") + if (noact == 0) and (tmem_style == "distributed"): + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -468,8 +547,8 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - self.reset_rtlsim(sim) - self.toggle_clk(sim) + reset_rtlsim(sim) + toggle_clk(sim) if mem_mode in ["external", "decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() @@ -501,3 +580,14 @@ def execute_node(self, context, graph): mode ) ) + + def code_generation_ipi(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + if self.get_nodeattr("mem_mode") == 
"decoupled": + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (vlnv, node_name, node_name) + ) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) \ No newline at end of file From 9a4dd046b51a26c46fd9c25dcd064d7ae7c81826 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 16:32:06 +0000 Subject: [PATCH 120/123] [hwcustom op]: removed do_reset flag --- src/finn/custom_op/fpgadataflow/hwcustomop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index 773938525b..f62cf1af8a 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -307,7 +307,6 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) From 30f6ddf91ebcbee60bb0fbbf972e2597ce1229ec Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 8 Feb 2024 16:32:45 +0000 Subject: [PATCH 121/123] [mvau hw-op]: moved lut/dsp estimations to specialized ops, modified stitched-ip method --- .../fpgadataflow/matrixvectoractivation.py | 89 +------------------ 1 file changed, 4 insertions(+), 85 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 145cf4f6e6..74aee63dc1 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -438,84 +438,6 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. 
Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. 
In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - tmem_style = self.get_nodeattr("ram_style_thresholds") - if (noact == 0) and (tmem_style == "distributed"): - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -955,12 +877,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) - + # Instantiate either the HLS or RTL IP depending on operator + self.code_generation_ipi(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -1031,7 +950,7 @@ def code_generation_ipi(self): cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": # base 
class impl sufficient for const/external modes - return super().code_generation_ipi() + self.code_generation_ipi(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd From 12ad48c6fdf1166068719ac98ae4469ce71d49a5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 11:21:32 +0000 Subject: [PATCH 122/123] [hls mvau]: fixed cppsim bipolar activations, added call to util functions from pyverilator and method to wire in IP --- .../hls/matrixvectoractivation_hls.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index d6d122e41b..aa3631a240 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -495,6 +495,7 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node + # TODO ensure codegen dir exists if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") elif mode == "rtlsim": @@ -512,6 +513,7 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights + # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -519,7 +521,12 @@ def execute_node(self, context, graph): not float32 as expected.""" expected_inp_shape = self.get_folded_input_shape() reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() # make copy before saving the array reshaped_input = 
reshaped_input.copy() np.save( @@ -549,9 +556,13 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) reset_rtlsim(sim) toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: + if mem_mode == "external" or mem_mode == "decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { @@ -568,6 +579,7 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output output = np.load(out_npy_path) oshape = self.get_normal_output_shape() @@ -576,14 +588,15 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to "rtlsim" """.format( + has to be set to one of the following value ("cppsim", "rtlsim")""".format( mode ) ) - def code_generation_ipi(self, cmd): + def instantiate_ip(self, cmd): # instantiate the HLS IP vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name if self.get_nodeattr("mem_mode") == "decoupled": cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" From 3202dc1331aea3cd1e71663d5087fe2d479b5ac3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 11:22:49 +0000 Subject: [PATCH 123/123] [hw mvau]: fixed bug for executing 2D arrays, modified create-stitched-ip method and reverted default resType to LUT --- .../fpgadataflow/matrixvectoractivation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 74aee63dc1..28c0c24c09 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -63,7 +63,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), + "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -152,11 +152,13 @@ def execute_node(self, context, graph): odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") - # NHWC to NCHW for multithreshold node - result = result.transpose((0, 3, 1, 2)) + if result.ndim == 4: + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) result = multithreshold(result, mvau_thr, out_scale, out_bias) - # NCHW to NHWC - result = result.transpose((0, 2, 3, 1)) + if result.ndim 
== 4: + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) context[node.output[0]] = result @@ -878,7 +880,7 @@ def code_generation_ipi(self): "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) # Instantiate either the HLS or RTL IP depending on operator - self.code_generation_ipi(cmd) + self.instantiate_ip(cmd) # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" @@ -950,7 +952,7 @@ def code_generation_ipi(self): cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": # base class impl sufficient for const/external modes - self.code_generation_ipi(cmd) + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd