fastmachinelearning · Harsh9650 · Mar 5, 2024
diff --git a/src/qonnx/analysis/l0_resource_estimates.py b/src/qonnx/analysis/l0_resource_estimates.py
@@ -0,0 +1,231 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc.
+# # All rights reserved.
+# #
+# # Redistribution and use in source and binary forms, with or without
+# # modification, are permitted provided that the following conditions are met:
+# #
+# # * Redistributions of source code must retain the above copyright notice, this
+# #   list of conditions and the following disclaimer.
+# #
+# # * Redistributions in binary form must reproduce the above copyright notice,
+# #   this list of conditions and the following disclaimer in the documentation
+# #   and/or other materials provided with the distribution.
+# #
+# # * Neither the name of qonnx nor the names of its
+# #   contributors may be used to endorse or promote products derived from
+# #   this software without specific prior written permission.
+# #
+# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.core.datatype import DataType
+
+"""DSP Type: a) None:
+                    For Fixed Points and floating point
+                        1) When dsp_type is None. All operations will be processed using LUTs.
+                        2) LUTs are calculated using: 1.1*b_width1*b_width2
+                        3) Example:
+                            a) op_mac_Int4_Int2: 1.1*4*2 = 8.8 LUTs.
+                            b) op_mac_Int8_INT8: 1.1*8*8 = 70.4 LUTs.
+                            c) op_mac_Int8_FLOAT16: 1.1*8*16 = 140.8 LUTs
+                            d) op_mac_FLOAT16_FLOAT16: 1.1*16*16 = 281.6 LUTs.
+
+             b) DSP48:
+                    For Fixed Points
+                        1) Everything less than 4 will be promoted to 4. For ex: INT2 will use the same resources as INT4.
+                        2) INT4: One dsp48 + 200 LUTs can accommodate 4 (4*4) bit mac.
+                                So, no of dsp's from mac's can be calculated as (0.25)*mac_count + (200*0.25)*mac_count LUTs.
+                        3) Everything between 5 and 8 will be promoted to 8, Ex: INT6 will use the same resources as INT8.
+                        4) INT8: One dsp48 + 200 LUTs can accommodate 2 (8*8) bit mac. So,
+                                 no of dsp's from mac's can be calculated as (0.5)*mac_count + (200*0.5)*mac_count LUTs.
+                    For Floating Points
+                        1) FLOAT32: 2 dsp + 700 LUT can accommodate 1 mac count.
+                        2) FLOAT16: 1 dsp + 400 LUT can accommodate 1 mac count.
+            c) DSP58:
+                    For Fixed Points
+                        1) INT8: One dsp58 can accommodate 3 (8*8) bit mac.
+                           So, no of dsp's from mac's can be calculated as (0.33)*mac_count.
+                        2) INT4: One dsp58 can accommodate 4 (4*4) bit mac.
+                           So, no of dsp's from mac's can be calculated as (0.25)*mac_count.
+                        3) INT16: 1 mac count requires 1 dsp.
+                    For Floating Points
+                        1) FLOAT32:  1 mac count requires 1 dsp.
+                        2) FLOAT16: 1 mac count requires 1 dsp.
+    Mapping strategy for On-Chip Memory (bits_per_res):
+                        a) 1 "BRAM", 1 "BRAM36" and 1 "BRAM_36K" can accommodate 36*1024 = 36864 bits.
+                        b) 1 "BRAM_18K" can accommodate 18*1024 = 18432 bits.
+                        c) 1 "URAM" can accommodate 288*1024 = 294912 bits.
+                        d) 1 LUT can accommodate 64 bits.
+"""
+# Per-MAC cost table, keyed first by the promoted datatype name and then by the
+# DSP flavour. Each value is a (dsp_factor, lut_factor) tuple: core_resources()
+# multiplies index 0 by the MAC count to obtain DSPs and index 1 to obtain LUTs.
+# NOTE(review): the "NONE" column is not read by the code in this module, which
+# uses the 1.1 * b_width1 * b_width2 formula for the LUT-only case instead.
+resource_table = {
+    "FLOAT32": {"NONE": (0, 1100), "DSP48": (2, 700), "DSP58": (1, 0)},
+    "FLOAT16": {"NONE": (0, 1100), "DSP48": (1, 400), "DSP58": (1, 0)},
+    "INT32": {"NONE": (0, 1100), "DSP48": (1, 0), "DSP58": (1, 0)},
+    "INT16": {"NONE": (0, 282), "DSP48": (1, 0), "DSP58": (1, 0)},
+    "INT8": {"NONE": (0, 71), "DSP48": (0.5, 100), "DSP58": (0.33, 0)},
+    "INT4": {"NONE": (0, 18), "DSP48": (0.25, 50), "DSP58": (0.25, 0)},
+}
+
+# Number of memory bits held by one instance of each on-chip memory resource;
+# ocm_resources() divides the required bit count by these values.
+bits_per_res = {"BRAM": 36864, "BRAM36": 36864, "BRAM_36K": 36864, "BRAM_18K": 18432, "URAM": 294912, "LUT": 64}
+
+
+def ocm_resources(num_mem_bits, uram_type, bram_type, d_factor):
+    """Estimate the on-chip memory resources needed to store ``num_mem_bits``.
+
+    The bits are split between URAM and BRAM according to ``d_factor``; the
+    resulting counts are fractional (no rounding is applied).
+    Args:
+        num_mem_bits (int): Number of memory bits to store.
+        uram_type (str): Key of the URAM resource in ``bits_per_res``.
+        bram_type (str): Key of the BRAM resource in ``bits_per_res``
+                         (BRAM, BRAM36, BRAM_36K or BRAM_18K).
+        d_factor (float): Share of the bits placed in URAM. ``None`` puts
+                          everything in LUTs, 1 everything in URAM, 0 everything
+                          in BRAM; any other value splits the bits between both.
+    Returns:
+        A dictionary mapping each used resource name to the amount required.
+    """
+    if d_factor is None:
+        # Neither BRAM nor URAM selected: everything lives in LUTs.
+        return {"LUT": num_mem_bits / bits_per_res["LUT"]}
+    if d_factor == 1:
+        # Everything in URAM.
+        return {uram_type: num_mem_bits / bits_per_res[uram_type]}
+    if d_factor == 0:
+        # Everything in BRAM.
+        return {bram_type: num_mem_bits / bits_per_res[bram_type]}
+
+    # Mixed case: d_factor of the bits go to URAM, the remainder to BRAM.
+    bram_share = 1 - d_factor
+    bram_count = (bram_share * num_mem_bits) / bits_per_res[bram_type]
+    uram_count = (d_factor * num_mem_bits) / bits_per_res[uram_type]
+    return {bram_type: bram_count, uram_type: uram_count}
+
+
+def promoting_datatype(dtype, b_width):
+    """Promote a datatype to the next precision bucket used for DSP mapping.
+
+    Args:
+        dtype (str): Datatype name containing "INT" or "FLOAT".
+        b_width (int): Bit width of the datatype.
+    Returns:
+        A ``(family, bit_width)`` tuple where family is "INT" or "FLOAT".
+        Integer widths are rounded up to 4, 8 or 16, and to 32 above that;
+        float widths become 16 if at most 16, otherwise 32.
+    Raises:
+        Exception: If ``dtype`` contains neither "INT" nor "FLOAT".
+    """
+    if "INT" in dtype:
+        # First bucket that is wide enough wins; anything wider falls into 32.
+        for bucket in (4, 8, 16):
+            if b_width <= bucket:
+                return "INT", bucket
+        return "INT", 32
+
+    if "FLOAT" in dtype:
+        return "FLOAT", (16 if b_width <= 16 else 32)
+
+    raise Exception("Unsupported data type")
+
+
+def dtype_casting(dtype1, dtype2, b_width1, b_width2):
+    """Return the common datatype name for a MAC between two operand types.
+
+    Both operands are first promoted with ``promoting_datatype``. Operands of
+    the same family yield that family at the wider promoted precision; for
+    mixed families the FLOAT operand's promoted type is used.
+    Args:
+        dtype1 (str): Datatype name of the first operand.
+        dtype2 (str): Datatype name of the second operand.
+        b_width1 (int): Bit width of the first operand.
+        b_width2 (int): Bit width of the second operand.
+    Returns:
+        The resulting datatype name as a string, e.g. "INT8" or "FLOAT16".
+    """
+    family_a, width_a = promoting_datatype(dtype1, b_width1)
+    family_b, width_b = promoting_datatype(dtype2, b_width2)
+
+    if family_a == family_b:
+        # Same family: keep it and take the wider precision.
+        return family_a + str(max(width_a, width_b))
+
+    # Mixed families: the float operand dictates type and precision.
+    if family_a == "FLOAT":
+        return family_a + str(width_a)
+    return family_b + str(width_b)
+
+
+def core_resources(inf_cost, dsp_type, bwidth_lower_limit, bwidth_upper_limit):
+    """Provide estimate resources required for the processing ("CORE"), assuming maximum unfolding.
+
+    Only keys containing "op_mac" are considered. The last two "_"-separated
+    fields of such a key are taken as the operand datatype names and looked up
+    in ``DataType``; the value is the MAC count.
+    Args:
+        inf_cost (dict): Inference cost dict.
+        dsp_type (str): None OR "DSP48" OR "DSP58". With None everything is
+                        computed in LUTs.
+        bwidth_lower_limit (int): MACs with an operand narrower than this are
+                        processed using LUTs.
+        bwidth_upper_limit (int): MACs with an operand (or promoted datatype)
+                        wider than this are processed using LUTs.
+    Returns:
+        A dictionary with the "LUT" estimate and the DSP estimate. The DSP entry
+        is keyed by ``dsp_type``, or by "DSP" when ``dsp_type`` is None.
+    """
+    # Decide the DSP key up front so it is defined even when inf_cost holds no
+    # "op_mac" entry (the loop body never runs in that case).
+    dsp_comp = "DSP" if dsp_type is None else dsp_type
+    dsp_res_mac = 0
+    lut_res_mac = 0
+
+    for key, mac_count in inf_cost.items():
+        if "op_mac" not in key:
+            continue
+        detail_list = key.split("_")
+        dtype1, dtype2 = detail_list[-1], detail_list[-2]
+        b_width1, b_width2 = DataType[dtype1].bitwidth(), DataType[dtype2].bitwidth()
+        # Cost of this MAC group when implemented purely in LUTs; always based
+        # on the original (unpromoted) bit widths.
+        lut_only_cost = 1.1 * b_width1 * b_width2 * mac_count
+
+        if dsp_type is None:
+            lut_res_mac += lut_only_cost
+            continue
+
+        outside_dsp_range = (b_width1 < bwidth_lower_limit or b_width2 < bwidth_lower_limit) or (
+            b_width1 > bwidth_upper_limit or b_width2 > bwidth_upper_limit
+        )
+        if outside_dsp_range:
+            lut_res_mac += lut_only_cost
+            continue
+
+        casted_dtype = dtype_casting(dtype1, dtype2, b_width1, b_width2)
+        casted_bwidth = DataType[casted_dtype].bitwidth()
+        if casted_bwidth > bwidth_upper_limit:
+            # Promotion pushed the precision past the limit: fall back to LUTs.
+            lut_res_mac += lut_only_cost
+        else:
+            # Table entries are (dsp_factor, lut_factor) per MAC.
+            dsp_factor, lut_factor = resource_table[casted_dtype][dsp_type]
+            dsp_res_mac += dsp_factor * mac_count
+            lut_res_mac += lut_factor * mac_count
+
+    return {"LUT": lut_res_mac, dsp_comp: dsp_res_mac}
+
+
+def l0_resource_estimates(
+    inf_cost, dsp_type=None, uram_type=None, bram_type=None, bwidth_lower_limit=8, bwidth_upper_limit=32, d_factor=None
+):
+    """Estimate the compute ("CORE") and memory ("OCM") resources for a model, assuming maximum unfolding.
+
+    Args:
+        inf_cost (dict): Inference cost dict; must contain "total_mem_w_bits".
+        dsp_type (str): None OR "DSP48" OR "DSP58". Default to None.
+        uram_type (str): URAM resource name. Default to None.
+        bram_type (str): BRAM resource name (BRAM, BRAM36, BRAM_36K, BRAM_18K). Default to None.
+        bwidth_lower_limit (int): Default to 8. Passed on to core_resources.
+        bwidth_upper_limit (int): Default to 32. Passed on to core_resources.
+        d_factor (float): Default to None. Passed on to ocm_resources, where it
+                          selects how memory is distributed between URAM and BRAM.
+    Returns:
+        A dictionary with the keys "CORE" and "OCM" holding the respective estimates.
+    """
+    return {
+        "CORE": core_resources(inf_cost, dsp_type, bwidth_lower_limit, bwidth_upper_limit),
+        "OCM": ocm_resources(inf_cost["total_mem_w_bits"], uram_type, bram_type, d_factor),
+    }
diff --git a/src/qonnx/util/l0_performance_estimate.py b/src/qonnx/util/l0_performance_estimate.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of qonnx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from qonnx.analysis.l0_resource_estimates import l0_resource_estimates
+
+""" Calculate the estimate amount of resources required for a model (from inference cost dict).
+    The estimates will be divided into two parts:
+        1) CORE: For processing
+        2) OCM: On-Chip Memory
+    First, a memory check is performed to verify enough memory is available to accommodate the model on the FPGA.
+    Then, for the resources required for processing (CORE), inference per second is calculated.
+    Args:
+        resource_budget (dict): Representing the resources available in a respective FPGA.
+        inf_cost (dict): Inference cost dict.
+        resource_estimates(): dsp_type (str), uram_type (str), bram_type (str), bwidth_lower_limit (int),
+        bwidth_upper_limit (int), d_factor (float)
+        clock_freq: Clock frequency in Hz. Default 3000000 (3 MHz).
+    Returns:
+        A tuple (limiting resource name, inferences per second), or the string
+        'Memory out of budget' if the memory check fails.
+    Examples:
+            1) est_res_req: {'CORE': {'LUT': 1198735769600.0, 'DSP48': 3450357760.0},
+                                                'OCM': {'BRAM_18K': 8798, 'URAM': 672}}
+
+            2) resource_budget: {'LUT': 4397752190000, 'BRAM_18K': 1182, 'URAM': 0, 'DSP48': 500000}
+"""
+# Static lookup data used by l0_performance_estimate():
+#   "res_limit":    fraction of each budgeted resource that is multiplied into
+#                   the budget before comparing against the requirement.
+#   "bits_per_res": memory bits held by one instance of each memory resource,
+#                   used to convert BRAM/URAM shortfalls into LUT counts.
+resource_map = {
+    "res_limit": {
+        "LUT": 0.7,
+        "BRAM": 0.80,
+        "BRAM36": 0.80,
+        "BRAM_36K": 0.80,
+        "BRAM_18K": 0.80,
+        "URAM": 0.80,
+        "DSP48": 0.80,
+        "DSP58": 0.80,
+    },
+    "bits_per_res": {"BRAM": 36864, "BRAM36": 36864, "BRAM_36K": 36864, "BRAM_18K": 18432, "URAM": 294912, "LUT": 64},
+}
+
+
+def d_fact(resource_budget, bits_per_res, uram_type, bram_type):
+    """Determine the d_factor (URAM share of the memory) for l0_resource_estimates.
+
+    Args:
+        resource_budget (dict): Available amount per resource name.
+        bits_per_res (dict): Memory bits held by one instance of each resource.
+        uram_type (str): URAM resource name, or None if URAM is not used.
+        bram_type (str): BRAM resource name, or None if BRAM is not used.
+    Returns:
+        None when neither memory type is given (or neither has any capacity in
+        the budget), 0 when only BRAM is given, 1 when only URAM is given,
+        otherwise the fraction of the available memory bits that are URAM bits.
+    """
+    if uram_type is None and bram_type is None:
+        return None
+    if uram_type is None:
+        return 0
+    if bram_type is None:
+        return 1
+
+    # A memory type that is absent from the budget contributes no capacity
+    # rather than raising KeyError.
+    available_bits_uram = resource_budget.get(uram_type, 0) * bits_per_res[uram_type]
+    available_bits_bram = resource_budget.get(bram_type, 0) * bits_per_res[bram_type]
+    total_available_bits = available_bits_uram + available_bits_bram
+    if total_available_bits == 0:
+        # No block memory at all: avoid dividing by zero and signal "use LUTs".
+        return None
+    return available_bits_uram / total_available_bits
+
+
+def l0_performance_estimate(
+    resource_budget,
+    inf_cost,
+    dsp_type=None,
+    uram_type=None,
+    bram_type=None,
+    bwidth_lower_limit=8,
+    bwidth_upper_limit=32,
+    clock_freq=3000000,
+):
+    """Estimate the achievable inferences per second within a resource budget.
+
+    First the on-chip memory requirement is checked against the budget; memory
+    that does not fit into BRAM/URAM spills into the LUTs set aside for memory.
+    If the memory fits, the inferences per second are computed for every
+    required compute resource and the smallest value is reported.
+    Args:
+        resource_budget (dict): Available amount per resource; must contain "LUT".
+        inf_cost (dict): Inference cost dict.
+        dsp_type (str): None OR "DSP48" OR "DSP58". Default to None.
+        uram_type (str): URAM resource name. Default to None.
+        bram_type (str): BRAM resource name. Default to None.
+        bwidth_lower_limit (int): Default to 8. Passed on to l0_resource_estimates.
+        bwidth_upper_limit (int): Default to 32. Passed on to l0_resource_estimates.
+        clock_freq (int): Clock frequency used to scale the result. Default to 3000000.
+    Returns:
+        A tuple ``(resource_name, inferences_per_second)`` for the limiting
+        compute resource, or the string "Memory out of budget" if the memory
+        requirement cannot be met.
+    Raises:
+        ValueError: If the memory fits but no compute resource is required, so
+        there is nothing to derive a rate from.
+    """
+    res_limit, bits_per_res = resource_map["res_limit"], resource_map["bits_per_res"]
+    d_factor = d_fact(resource_budget, bits_per_res, uram_type, bram_type)
+    est_res_req = l0_resource_estimates(
+        inf_cost, dsp_type, uram_type, bram_type, bwidth_lower_limit, bwidth_upper_limit, d_factor
+    )
+    ocm_res_req, core_res_req = est_res_req["OCM"], est_res_req["CORE"]
+    # LUTs above the LUT usage limit are set aside for the memory requirement.
+    luts_for_mem = (1 - res_limit["LUT"]) * resource_budget["LUT"]
+
+    memory_check = True
+    for res_name, res in ocm_res_req.items():
+        spill_message = None
+        if res_name == "LUT":
+            luts_req = res
+            lut_headroom = luts_for_mem - luts_req
+        elif res_name in resource_budget:
+            resource_tally = (res_limit[res_name] * resource_budget[res_name]) - res
+            if resource_tally >= 0:
+                # Fits into the budgeted memory resource; no LUTs needed.
+                continue
+            # Convert the shortfall into the equivalent number of LUTs.
+            luts_req = (bits_per_res[res_name] / bits_per_res["LUT"]) * abs(resource_tally)
+            # NOTE(review): unlike the other two cases, the LUT usage limit is
+            # applied to the reserved LUT pool again here; kept as in the
+            # original, confirm whether this is intended.
+            lut_headroom = (res_limit["LUT"] * luts_for_mem) - luts_req
+            spill_message = f"{res_name} out of budget, using luts"
+        else:
+            # Resource not present in the budget at all: everything goes to LUTs.
+            luts_req = (bits_per_res[res_name] / bits_per_res["LUT"]) * res
+            lut_headroom = luts_for_mem - luts_req
+            spill_message = f"{res_name} not available in the budget, using luts"
+
+        if lut_headroom >= 0:
+            if spill_message is not None:
+                print(spill_message)
+            luts_for_mem = luts_for_mem - luts_req
+        else:
+            memory_check = False
+            break
+
+    if not memory_check:
+        return "Memory out of budget"
+
+    expected_inference = {}
+    for core_name, required in core_res_req.items():
+        if required > 0:
+            expected_inference[core_name] = (
+                (res_limit[core_name] * resource_budget[core_name]) / required
+            ) * clock_freq
+    if not expected_inference:
+        # Same exception type min() would raise on an empty dict, with a message
+        # that names the actual cause.
+        raise ValueError("inf_cost requires no compute resources; cannot estimate inferences per second")
+    min_infc_res = min(expected_inference, key=expected_inference.get)
+    return (min_infc_res, expected_inference[min_infc_res])