diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 4374111f22..f9a251a8c7 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -2,16 +2,15 @@ name: DockerImage on: push: - branches: - - 'dev' + branches: [ dev ] jobs: docker: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 2fbb9265be..011ccebadc 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,10 +13,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 + with: + python-version: '3.10' - name: Run Lint - uses: pre-commit/action@v2.0.0 + uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index d188007465..e2ba47ec29 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -11,43 +11,15 @@ jobs: test: name: Run quicktest on PR branch - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 - - - name: set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: cache Docker layers - uses: actions/cache@v2 - with: - path: /tmp/.buildx-cache - key: ${{ runner.os }}-buildx-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-buildx- - - - name: Build and push - uses: docker/build-push-action@v2 - with: - file: docker/Dockerfile.finn - context: . - push: false - load: true - tags: finn_gha - cache-from: type=local,src=/tmp/.buildx-cache - cache-to: type=local,dest=/tmp/.buildx-cache-new - - - # Temp fix - # https://github.com/docker/build-push-action/issues/252 - # https://github.com/moby/buildkit/issues/1896 - name: Move cache - run: | - rm -rf /tmp/.buildx-cache - mv /tmp/.buildx-cache-new /tmp/.buildx-cache - + uses: actions/checkout@v3 - name: DockerRunQuicktest run: | - docker run --init --hostname finn_gha -w $(pwd) -v $(pwd):$(pwd) -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh + export FINN_ROOT=$(pwd) + export FINN_BUILD_DIR=/tmp/finn_gha + export FINN_INST_NAME=finn_gha + ./run-docker.sh quicktest diff --git a/.isort.cfg b/.isort.cfg index 6cfe1c8919..5378b88fad 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,7 +2,7 @@ line_length=88 indent=' ' skip=.tox,.venv,build,dist -known_standard_library=setuptools,pkg_resources +known_standard_library=setuptools known_test=pytest known_first_party=finn sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dfc83ba618..72a9688505 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,11 +29,11 @@ exclude: '^docs/conf.py' default_language_version: - python: python3.8 + python: python3.10 repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.2.0 + rev: v4.4.0 hooks: - id: trailing-whitespace exclude: '\.dat$' @@ -51,20 +51,21 @@ repos: args: ['--fix=no'] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.3.0 hooks: - id: black language_version: python3 + args: [--line-length=100] -- repo: https://gitlab.com/pycqa/flake8 - 
rev: 3.9.2 +- repo: https://github.com/PyCQA/flake8 + rev: 6.0.0 hooks: - id: flake8 # black-compatible flake-8 config - args: ['--max-line-length=88', # black default + args: ['--max-line-length=100', # black default '--extend-ignore=E203'] # E203 is not PEP8 compliant diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 3601fcdccf..478957be11 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -35,7 +35,7 @@ sphinx: configuration: docs/finn/conf.py python: - version: 3.7 + version: 3.8 install: - method: pip path: . diff --git a/AUTHORS.rst b/AUTHORS.rst index d011ce3d7a..861b81924b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -9,7 +9,7 @@ Contributors * Hendrik Borras (@HenniOVP) * Lucian Petrica (@quetric) * Tobias Alonso (@Tobi-Alonso) -* Felix Paul Jentzsch (@felixpj) +* Felix Paul Jentzsch (@fpjentzsch) * Mirza Mrahorovic (@mmrahorovic) * Suranga Mahesh (@surangamh) * Peter Lehnhardt (@pete-lennart) @@ -26,3 +26,5 @@ Contributors * Aziz Bahri (@azizb-xlnx) * Fionn O'Donohoe (@fionnodonohoe-xlnx) * Matthias Gehre (@mgehre-amd) +* Hugo Le Blevec (@hleblevec) +* Patrick Geel (@patrickgeel) diff --git a/README.md b/README.md index 1b8efc8f19..2e1faf8f0c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## Documentation -You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/master/notebooks), which we recommend running from inside Docker for a better experience. +You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. ## Community @@ -67,4 +67,4 @@ The current implementation of the framework is based on the following publicatio ## Old version We previously released an early-stage prototype of a toolflow that took in Caffe-HWGQ binarized network descriptions and produced dataflow architectures. You can find it in the [v0.1](https://github.com/Xilinx/finn/tree/v0.1) branch in this repository. -Please be aware that this version is deprecated and unsupported, and the master branch does not share history with that branch so it should be treated as a separate repository for all purposes. +Please be aware that this version is deprecated and unsupported, and the main branch does not share history with that branch so it should be treated as a separate repository for all purposes. diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp index 3001f6613e..037b038a09 100644 --- a/custom_hls/lookup.hpp +++ b/custom_hls/lookup.hpp @@ -26,14 +26,15 @@ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-
- *******************************************************************************/
+*******************************************************************************/
+#ifndef LOOKUP_HPP
+#define LOOKUP_HPP

 #include <ap_int.h>
 #include <hls_stream.h>

-#ifndef LOOKUP_HPP
-#define LOOKUP_HPP
+#include "utils.hpp"
+

 template <
   unsigned NumEmbeddings,
@@ -57,4 +58,50 @@ void StreamingLookup(
   }
 }

+/**
+ * Lookup implementation over a table stored in AXI-accessible memory.
+ */
+template <
+  unsigned EmbeddingSize,                          // Number of memory words per embedding
+  unsigned EmbeddingAlign = clog2(EmbeddingSize),  // Alignment of entries = number of word index bits
+  typename T_SRC,
+  typename T_DST
+>
+void StreamingLookup_ext(
+  hls::stream<T_SRC> &in0,
+  hls::stream<T_DST> &out,
+  T_DST const *const mem,
+  unsigned const size,
+  unsigned &oob_count,
+  bool &oob_irq
+) {
+#pragma HLS pipeline II=EmbeddingSize+9 style=flp
+
+  static unsigned oob_count_li;
+  static unsigned oob_count_int;
+#pragma HLS reset variable=oob_count_li
+#pragma HLS reset variable=oob_count_int
+
+  if(oob_count != oob_count_li) {
+    oob_count_int -= oob_count_li;
+    oob_count_li = oob_count;
+  }
+  if(!in0.empty()) {
+    T_SRC const x = in0.read();
+
+    // Map out-of-bounds inputs to an offset of zero and increment counter
+    bool const oob = x >= T_SRC(size);
+    ap_uint<T_SRC::width+EmbeddingAlign> const ofs =
+      ((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0));
+    oob_count_int += oob;
+
+    // Stream lookup data (burst inferred)
+    for(unsigned i = 0; i < EmbeddingSize; i++) {
+#pragma HLS pipeline II=1 style=flp
+      out.write(mem[ofs+i]);
+    }
+  }
+  oob_count = oob_count_int;
+  oob_irq = (oob_count_int != 0);
+}
 #endif
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index a3f40d52ef..06dc109808 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -26,10 +26,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
+FROM ubuntu:jammy-20230126
 LABEL maintainer="Yaman Umuroglu "

-ARG XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"
+ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"

 WORKDIR /workspace

@@ -46,7 +46,6 @@ RUN apt-get update && \
     libsm6 \
     libxext6 \
     libxrender-dev \
-    verilator \
     nano \
     zsh \
     rsync \
@@ -58,10 +57,23 @@ RUN apt-get update && \
     unzip \
     zip \
     locales \
-    lsb-core
+    lsb-core \
+    python3 \
+    python-is-python3 \
+    python3-pip

 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 RUN locale-gen "en_US.UTF-8"

+# install Verilator from source to get the right version
+RUN apt-get install -y git perl make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev
+RUN git clone https://github.com/verilator/verilator
+RUN cd verilator && \
+    git checkout v4.224 && \
+    autoconf && \
+    ./configure && \
+    make -j4 && \
+    make install
+
 # install XRT
 RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
 RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
@@ -72,22 +84,31 @@ RUN rm /tmp/$XRT_DEB_VERSION.deb
 COPY requirements.txt .
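As a usage illustration for the new `StreamingLookup_ext` added to `custom_hls/lookup.hpp` above, a minimal synthesis top-level might look like the sketch below. The port widths, table dimensions and interface pragmas are illustrative assumptions, not part of this patch:

```cpp
#include <ap_int.h>
#include <hls_stream.h>
#include "lookup.hpp"

// Hypothetical table: 1024 embeddings of 64 32-bit words each, held in
// AXI-accessible external memory (e.g. DDR reached via m_axi).
void lookup_top(
    hls::stream<ap_uint<16>> &in0,  // embedding indices
    hls::stream<ap_uint<32>> &out,  // streamed embedding words
    ap_uint<32> const *mem,         // embedding table base pointer
    unsigned &oob_count,
    bool &oob_irq
) {
#pragma HLS interface axis port=in0
#pragma HLS interface axis port=out
#pragma HLS interface m_axi port=mem offset=slave
    // EmbeddingSize=64; EmbeddingAlign defaults to clog2(64)=6,
    // and T_SRC/T_DST are deduced from the stream element types.
    StreamingLookup_ext<64>(in0, out, mem, 1024, oob_count, oob_irq);
}
```

Note that the index type must be wide enough to represent `size` without wrapping, since the out-of-bounds check compares against `T_SRC(size)`.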
RUN pip install -r requirements.txt RUN rm requirements.txt + +# install PyTorch +RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 + # extra Python package dependencies (for testing and interaction) -RUN pip install pygments==2.4.1 -RUN pip install ipykernel==5.5.5 -RUN pip install jupyter==1.0.0 +RUN pip install pygments==2.14.0 +RUN pip install ipykernel==6.21.2 +RUN pip install jupyter==1.0.0 --ignore-installed RUN pip install markupsafe==2.0.1 -RUN pip install matplotlib==3.3.1 --ignore-installed +RUN pip install matplotlib==3.7.0 --ignore-installed RUN pip install pytest-dependency==0.5.1 -RUN pip install sphinx==5.0.2 -RUN pip install sphinx_rtd_theme==0.5.0 -RUN pip install pytest-xdist[setproctitle]==2.4.0 -RUN pip install pytest-parallel==0.1.0 +RUN pip install pytest-xdist[setproctitle]==3.2.0 +RUN pip install pytest-parallel==0.1.1 RUN pip install "netron>=5.0.0" -RUN pip install pandas==1.1.5 -RUN pip install scikit-learn==0.24.1 -RUN pip install tqdm==4.31.1 +RUN pip install pandas==1.5.3 +RUN pip install scikit-learn==1.2.1 +RUN pip install tqdm==4.64.1 RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading +# these versions of pytest and associated plugins allow for stable collection of +# test reports and code coverage reports in HTML +RUN pip install pytest==6.2.5 +RUN pip install pytest-metadata==1.7.0 +RUN pip install pytest-html==3.0.0 +RUN pip install pytest-html-merger==0.0.8 +RUN pip install pytest-cov==4.1.0 # extra dependencies from other FINN deps # installed in Docker image to make entrypoint script go faster diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index b5c702111a..6b33a4c9bc 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -54,8 +54,9 @@ recho () { echo -e "${RED}ERROR: $1${NC}" } -# qonnx -pip install --user -e ${FINN_ROOT}/deps/qonnx +# qonnx (using workaround for https://github.com/pypa/pip/issues/7953) +# to be fixed in future Ubuntu versions (https://bugs.launchpad.net/ubuntu/+source/setuptools/+bug/1994016) +pip install --no-build-isolation --no-warn-script-location -e ${FINN_ROOT}/deps/qonnx # finn-experimental pip install --user -e ${FINN_ROOT}/deps/finn-experimental # brevitas @@ -113,6 +114,27 @@ else yecho "If you need Vitis HLS, ensure HLS_PATH is set correctly and mounted into the Docker container." 
fi +if [ -d "$FINN_ROOT/.Xilinx" ]; then + mkdir "$HOME/.Xilinx" + if [ -f "$FINN_ROOT/.Xilinx/HLS_init.tcl" ]; then + cp "$FINN_ROOT/.Xilinx/HLS_init.tcl" "$HOME/.Xilinx/" + gecho "Found HLS_init.tcl and copied to $HOME/.Xilinx/HLS_init.tcl" + else + yecho "Unable to find $FINN_ROOT/.Xilinx/HLS_init.tcl" + fi + + if [ -f "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" ]; then + mkdir "$HOME/.Xilinx/Vivado/" + cp "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" "$HOME/.Xilinx/Vivado/" + gecho "Found Vivado_init.tcl and copied to $HOME/.Xilinx/Vivado/Vivado_init.tcl" + else + yecho "Unable to find $FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" + fi +else + echo "If you need to enable a beta device, ensure .Xilinx/HLS_init.tcl and/or .Xilinx/Vivado/Vivado_init.tcl are set correctly and mounted" + echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts" +fi + export PATH=$PATH:$HOME/.local/bin # execute the provided command(s) as root exec "$@" diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile index e3e5b5f7f9..b19cbbccf1 100644 --- a/docker/jenkins/Jenkinsfile +++ b/docker/jenkins/Jenkinsfile @@ -1,46 +1,869 @@ -node { - def app - stage('Clone repository') { - /* Let's make sure we have the repository cloned to our workspace */ - checkout scm +pipeline { + agent none + parameters { + booleanParam(name: 'fpgadataflow', defaultValue: false, description: 'Run fpgadataflow tests') + booleanParam(name: 'sanity', defaultValue: true, description: 'Run sanity hardware and unit tests') + booleanParam(name: 'end2end', defaultValue: false, description: 'Run end2end tests') + } + stages { + stage('Sanity Tests') { + parallel { + stage('Sanity - Build Hardware') { + when { + expression { return params['sanity'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "bnn_build_sanity" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("sanity_bnn", "${env.TEST_NAME}", '') + + // Find the board's build files (bitstreams/xclbins) and zip for use on the boards themselves + findCopyZip("Pynq-Z1", env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "sanity_PynqZ1_zip") + findCopyZip("ZCU104", env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "sanity_ZCU104_zip") + findCopyZip("KV260_SOM", env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "sanity_KV260_SOM_zip") + findCopyZip("U250", env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "sanity_U250_zip") + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}", includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_SANITY = "SUCCESS" + } + } + } + } + stage('Sanity - Unit Tests') { + when { + expression { params['sanity'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "sanity_ut" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Multiple markers with pytest needs its own script + createMultiMarkerScript("util or brevitas_export or streamline or transform or notebooks", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_sanity_ut") + sh 
'./run-docker.sh ./run-tests.sh' + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.SANITY_UT = "SUCCESS" + + // Archive coverage report if successful + archiveSuccessfulStage(env.SANITY_UT, "coverage_sanity_ut") + } + } + } + } + stage('Sanity - fpgadataflow Tests') { + when { + expression { params['fpgadataflow'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "fpgadataflow" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("fpgadataflow", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_fpgadataflow") + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.FPGADATAFLOW_RESULT = "SUCCESS" + + // Archive coverage report if successful + archiveSuccessfulStage(env.FPGADATAFLOW_RESULT, "coverage_fpgadataflow") + } + } + } + } + } } - withEnv([ - "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.1_0420_0327/installs/lin64", - "FINN_XILINX_VERSION=2022.1", - "FINN_DOCKER_TAG=xilinx/finn:jenkins", - "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", - "PLATFORM_REPO_PATHS=/opt/xilinx/platforms" - ]){ - parallel firstBranch: { - stage('Brevitas export') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mbrevitas_export") + stage('End2end - Build Hardware') { + parallel { + stage('End2end') { + when { + expression { params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "end2end" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Delete any build files from a previous build + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker(env.TEST_NAME, "${env.TEST_NAME}", '') + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.END2END_RESULT = "SUCCESS" + } + } + } + } + stage('BNN end2end - U250') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "U250" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_u250", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "${env.BOARD}_zip") + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_U250 = "SUCCESS" + } + } + } + 
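For reference, a sketch of the shell command that the `runDockerPytestWithMarker` helper (defined near the bottom of this Jenkinsfile) issues for the U250 build stage above, once the marker and result filenames are substituted:

```sh
./run-docker.sh python -m pytest -m bnn_u250 \
    --junitxml=bnn_build_full_U250.xml \
    --html=bnn_build_full_U250.html --self-contained-html
```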
} + stage('BNN end2end - Pynq-Z1') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "Pynq-Z1" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_pynq", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "PynqZ1_zip") + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_PynqZ1", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_PYNQZ1 = "SUCCESS" + } + } + } + } + stage('BNN end2end - ZCU104') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "ZCU104" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_zcu104", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "${env.BOARD}_zip") + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_ZCU104 = "SUCCESS" + } + } + } + } + stage('BNN end2end - KV260_SOM') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "KV260_SOM" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_kv260", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME, "${env.BOARD}_zip") + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_KV260_SOM = "SUCCESS" + } + } + } + } + } + } + stage('Sanity & BNN end2end - Setup Hardware Tests') { + when { + expression { return params['sanity'] } + } + agent { + label 'finn-build' + } + steps { + script { + // Check which boards are online before running HW tests + env.ALVEO_HOST_ONLINE = isNodeOnline('finn-u250') + env.PYNQ_ONLINE = isNodeOnline('finn-pynq') + env.ZCU104_ONLINE = isNodeOnline('finn-zcu104') + env.KV260_ONLINE = isNodeOnline('finn-kv260') + + // Stash the HW test scripts to be used on slave nodes + 
dir('docker/jenkins') { + stash name: 'bnn_test_files', includes: 'test_bnn_hw_pytest.py' + } + } + } + } + stage('Sanity - Run Hardware Tests') { + parallel { + stage('BNN Sanity - U250') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ALVEO_HOST_ONLINE == 'true' && params['sanity'] && env.BNN_BUILD_SANITY == 'SUCCESS') } + } + agent { + label 'finn-u250' + } + environment { + BOARD = 'U250' + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "sanity_${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "sanity_bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.SANITY_BNN_TEST_U250 = "SUCCESS" + + // Execute the script + sh './run-tests.sh' + } + } + } + } + post { + always { + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_sanity_bnn_test_${env.BOARD}", includes: "sanity_bnn_test_hw_${env.BOARD}.xml,sanity_bnn_test_hw_${env.BOARD}.html" + } + } + } + } + stage('BNN Sanity - Pynq-Z1') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.PYNQ_ONLINE == 'true' && params['sanity'] && env.BNN_BUILD_SANITY == 'SUCCESS') } + } + agent { + label 'finn-pynq' + } + environment { + BOARD = 'Pynq-Z1' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "sanity_PynqZ1_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + // The marker here omits the '-Z1' as '-' is a special character + // that will not work with Pytest + createTestScript(env.BOARD, 'Pynq', "sanity_bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.SANITY_BNN_TEST_PYNQZ1 = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' + } + } + } + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_sanity_bnn_test_PynqZ1", includes: "sanity_bnn_test_hw_${env.BOARD}.xml,sanity_bnn_test_hw_${env.BOARD}.html" + } + } + } + } + stage('BNN Sanity - ZCU104') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ZCU104_ONLINE == 'true' && params['sanity'] && env.BNN_BUILD_SANITY == 'SUCCESS') } + } + agent { + label 'finn-zcu104' + } + environment { + BOARD = 'ZCU104' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "sanity_${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary 
for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "sanity_bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.SANITY_BNN_TEST_ZCU104 = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' + } + } + } + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_sanity_bnn_test_${env.BOARD}", includes: "sanity_bnn_test_hw_${env.BOARD}.xml,sanity_bnn_test_hw_${env.BOARD}.html" + } + } + } + } + stage('BNN Sanity - KV260_SOM') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.KV260_ONLINE == 'true' && params['sanity'] && env.BNN_BUILD_SANITY == 'SUCCESS') } + } + agent { + label 'finn-kv260' + } + environment { + BOARD = 'KV260_SOM' + USER_CREDENTIALS = credentials('user-ubuntu-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "sanity_${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "sanity_bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.SANITY_BNN_TEST_KV260_SOM = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' } + } } - }, secondBranch: { - stage('Streamlining transformations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mstreamline") + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_sanity_bnn_test_${env.BOARD}", includes: "sanity_bnn_test_hw_${env.BOARD}.xml,sanity_bnn_test_hw_${env.BOARD}.html" + } + } + } + } + } + } + stage('End2end - Run Hardware Tests') { + parallel { + stage('BNN end2end - U250') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ALVEO_HOST_ONLINE == 'true' && params['end2end'] && env.BNN_BUILD_U250 == 'SUCCESS') } + } + agent { + label 'finn-u250' + } + environment { + BOARD = 'U250' + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.BNN_TEST_U250 = "SUCCESS" + + // Execute the script + sh './run-tests.sh' } + } + } + } + post { + always { + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_bnn_test_${env.BOARD}", includes: "bnn_test_hw_${env.BOARD}.xml,bnn_test_hw_${env.BOARD}.html" + } } - }, thirdBranch: { - 
stage('Util functions') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mutil") + } + } + stage('BNN end2end - Pynq-Z1') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.PYNQ_ONLINE == 'true' && params['end2end'] && env.BNN_BUILD_PYNQZ1 == 'SUCCESS') } + } + agent { + label 'finn-pynq' + } + environment { + BOARD = 'Pynq-Z1' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "PynqZ1_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + // The marker here omits the '-Z1' as '-' is a special character + // that will not work with Pytest + createTestScript(env.BOARD, 'Pynq', "bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.BNN_TEST_PYNQZ1 = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' } + } + } + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_bnn_test_PynqZ1", includes: "bnn_test_hw_${env.BOARD}.xml,bnn_test_hw_${env.BOARD}.html" + } } - }, fourthBranch: { - stage('General transformations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mtransform") + } + } + stage('BNN end2end - ZCU104') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ZCU104_ONLINE == 'true' && params['end2end'] && env.BNN_BUILD_ZCU104 == 'SUCCESS') } + } + agent { + label 'finn-zcu104' + } + environment { + BOARD = 'ZCU104' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.BNN_TEST_ZCU104 = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' } + } + } + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_bnn_test_${env.BOARD}", includes: "bnn_test_hw_${env.BOARD}.xml,bnn_test_hw_${env.BOARD}.html" + } } - }, fifthBranch: { - stage('Fpgadataflow transformations and simulations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mfpgadataflow") + } + } + stage('BNN end2end - KV260_SOM') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.KV260_ONLINE == 'true' && params['end2end'] && env.BNN_BUILD_KV260_SOM == 'SUCCESS') } + } + 
agent { + label 'finn-kv260' + } + environment { + BOARD = 'KV260_SOM' + USER_CREDENTIALS = credentials('user-ubuntu-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Clean any files from a previous run + cleanPreviousBoardBuildFiles("${env.BOARD}*") + + // Get the test files + unstash name: "${env.BOARD}_zip" + sh "unzip -o ${env.BOARD}.zip" + + dir(env.BOARD) { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(env.BOARD, env.BOARD, "bnn_test_hw_${env.BOARD}") + + // Use an env variable to help collect test results later in pipeline + env.BNN_TEST_KV260_SOM = "SUCCESS" + + // Execute the script as the root user - needed for zynq platforms + sh 'echo $USER_CREDENTIALS_PSW | sudo -S ./run-tests.sh' } + } + } + } + post { + always { + // Get test result file and delete test files on the board + dir(env.BOARD) { + // Collect the results file on the slave node by stashing + stash name: "xml_bnn_test_${env.BOARD}", includes: "bnn_test_hw_${env.BOARD}.xml,bnn_test_hw_${env.BOARD}.html" + } } + } } + } } + stage('Check Stage Results') { + agent { + label 'finn-build' + } + steps { + catchError(buildResult: 'SUCCESS') { + script { + checkAllBoards() + } + } + } + post { + always { + script { + sh 'mkdir -p reports' + cleanPreviousBuildFiles('reports') + dir('reports') { + // Only unstash for stages that ran + unstashSuccessfulStage(env.SANITY_UT, "sanity_ut") + unstashSuccessfulStage(env.FPGADATAFLOW_RESULT, "fpgadataflow") + unstashSuccessfulStage(env.BNN_BUILD_SANITY, "bnn_build_sanity") + unstashSuccessfulStage(env.SANITY_BNN_TEST_U250, "xml_sanity_bnn_test_U250") + unstashSuccessfulStage(env.SANITY_BNN_TEST_PYNQZ1, "xml_sanity_bnn_test_PynqZ1") + unstashSuccessfulStage(env.SANITY_BNN_TEST_ZCU104, "xml_sanity_bnn_test_ZCU104") + unstashSuccessfulStage(env.SANITY_BNN_TEST_KV260_SOM, "xml_sanity_bnn_test_KV260_SOM") + unstashSuccessfulStage(env.END2END_RESULT, "end2end") + unstashSuccessfulStage(env.BNN_BUILD_U250, "bnn_build_full_U250") + unstashSuccessfulStage(env.BNN_BUILD_PYNQZ1, "bnn_build_full_PynqZ1") + unstashSuccessfulStage(env.BNN_BUILD_ZCU104, "bnn_build_full_ZCU104") + unstashSuccessfulStage(env.BNN_BUILD_KV260_SOM, "bnn_build_full_KV260_SOM") + unstashSuccessfulStage(env.BNN_TEST_U250, "xml_bnn_test_U250") + unstashSuccessfulStage(env.BNN_TEST_PYNQZ1, "xml_bnn_test_PynqZ1") + unstashSuccessfulStage(env.BNN_TEST_ZCU104, "xml_bnn_test_ZCU104") + unstashSuccessfulStage(env.BNN_TEST_KV260_SOM, "xml_bnn_test_KV260_SOM") + } + + // Combine individual HTML files to one single report + sh './run-docker.sh pytest_html_merger -i reports/ -o reports/test_report_final.html' + + // Archive the XML & HTML test results + archiveArtifacts artifacts: "reports/*.xml" + archiveArtifacts artifacts: "reports/*.html" + + // Plot what XML files were created during the test run + junit 'reports/*.xml' + } + } + } + } + } +} + +void cleanPreviousBuildFiles(String buildDir) { + // Delete any build files from a previous build + // Previous build folders affect findCopyZip() and can cause the stage to fail + if (!buildDir.empty) { + sh "rm -rf ${buildDir}" + } +} + +void cleanPreviousBoardBuildFiles(String boardDir) { + // Delete any board build files + // Specifically used on Pynq boards which require sudo to delete + if (!boardDir.empty) { + sh "echo $USER_CREDENTIALS_PSW | sudo -S rm -rf ${boardDir}*" + } +} + +void createMultiMarkerScript(String markers, String testResultsFilename, String 
additionalOptions) { + // Passing multiple markers when running ./run-docker.sh does not work with bash. + // Therefore, create a script to maintain the single quotes that surround the markers + sh """echo "#!/bin/bash +python -m pytest -m \'${markers}\' --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}" >> run-tests.sh + """ + + // Give permissions to script + sh 'chmod 777 run-tests.sh' +} + +void runDockerPytestWithMarker(String marker, String testResultsFilename, String additionalOptions) { + sh """./run-docker.sh python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}""" +} + +def findBoardBuildFiles(String searchDir, String dirToFind) { + def result = sh(script: "find $searchDir -type d -name \"$dirToFind*\"", returnStdout: true).trim() + if (result.empty) { + error "Directory containing '$dirToFind' not found." + } + return result +} + +void findCopyZip(String board, String findDir, String copyDir, String stashName) { + def buildDir = findBoardBuildFiles(findDir, "hw_deployment_${board}") + sh "cp -r ${buildDir}/${board} ${copyDir}/" + dir(copyDir) { + sh "zip -r ${board}.zip ${board}/" + stash name: stashName, includes: "${board}.zip" + } +} + +void createTestScript(String board, String marker, String testResultsFilename) { + if(board == "U250") + sh """echo "#!/bin/bash +. /opt/xilinx/xrt/setup.sh +. ${VENV_ACTIVATE} +python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" >> run-tests.sh + """ + else + sh """echo "#!/bin/bash +. /etc/profile.d/pynq_venv.sh +. /etc/profile.d/xrt_setup.sh +python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" >> run-tests.sh + """ + + // Give permissions to script + sh 'chmod 777 run-tests.sh' +} + +def isNodeOnline(String labelName) { + Label label = Jenkins.instance.getLabel(labelName) + def agentOnline = false + + if (label) { + List nodes = Jenkins.instance.getNodes() + + nodes.each { node -> + if (node.getAssignedLabels().contains(label)) { + def computer = node.toComputer() + if (computer && computer.isOnline()) { + agentOnline = true + } else { + echo """Agent ${node.displayName} is offline""" + } + } + } + } else { + echo """Node with label ${labelName} not found""" + } + + return agentOnline +} + +def checkAllBoards() { + def overallResult = true + + if (env.PYNQ_ONLINE == 'false') { + overallResult = false + } + + if (env.ALVEO_HOST_ONLINE == 'false') { + overallResult = false + } + + if (env.KV260_ONLINE == 'false') { + overallResult = false + } + + if (env.ZCU104_ONLINE == 'false') { + overallResult = false + } + + return overallResult +} + +void unstashSuccessfulStage(String stageEnvVariableSet, String stashName) { + if (stageEnvVariableSet) { + unstash stashName + } +} + +void archiveSuccessfulStage(String stageEnvVariableSet, String folder) { + if (stageEnvVariableSet) { + archiveArtifacts artifacts: "${folder}/**/*" + } } diff --git a/docker/jenkins/Jenkinsfile_CI b/docker/jenkins/Jenkinsfile_CI new file mode 100644 index 0000000000..5e7d5f1475 --- /dev/null +++ b/docker/jenkins/Jenkinsfile_CI @@ -0,0 +1,46 @@ +node('finn-build || built-in') { + def app + stage('Clone repository') { + /* Let's make sure we have the repository cloned to our workspace */ + checkout scm + } + withEnv([ + 
"FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.2_1014_8888/installs/lin64", + "FINN_XILINX_VERSION=2022.2", + "FINN_DOCKER_TAG=xilinx/finn:jenkins", + "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", + "PLATFORM_REPO_PATHS=/opt/xilinx/platforms" + ]){ + parallel firstBranch: { + stage('Brevitas export') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mbrevitas_export") + } + } + }, secondBranch: { + stage('Streamlining transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mstreamline") + } + } + }, thirdBranch: { + stage('Util functions') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mutil") + } + } + }, fourthBranch: { + stage('General transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mtransform") + } + } + }, fifthBranch: { + stage('Fpgadataflow transformations and simulations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mfpgadataflow") + } + } + } + } +} diff --git a/docker/jenkins/test_bnn_hw_pytest.py b/docker/jenkins/test_bnn_hw_pytest.py new file mode 100755 index 0000000000..c8f4fbf74d --- /dev/null +++ b/docker/jenkins/test_bnn_hw_pytest.py @@ -0,0 +1,208 @@ +import pytest + +import itertools +import logging +import numpy as np +import os +import subprocess +from scipy.stats import linregress + +# no __init__ constructors allowed in Pytest - so use global variables instead +base_dir_global = os.getcwd() +default_test_run_timeout = 30 # seconds +output_execute_results_file = "output.npy" +execute_results_reference_file = "output_reference.npy" +output_throughput_results_file = "nw_metrics.txt" +throughput_results_formatted_file = "throughput_metrics_formatted.txt" +logger = logging.getLogger(__name__) + + +def remove_cache_dirs(dir_list): + tmp_list = list(dir_list) + for i in range(len(tmp_list) - 1, -1, -1): + if ".pytest_cache" in tmp_list[i]: + del tmp_list[i] + elif "__pycache__" in tmp_list[i]: + del tmp_list[i] + return tmp_list + + +def delete_file(file_path): + # Check if the file exists before deleting it + if os.path.exists(file_path): + try: + os.remove(file_path) + logger.info(f"File '{file_path}' deleted successfully.") + except Exception as e: + logger.error(f"An error occurred while deleting the file: {e}") + else: + logger.info(f"File '{file_path}' does not exist. Continuing with the script.") + + +def get_platform(board_str): + return "alveo" if "U250" in board_str else "zynq-iodma" + + +def get_full_parameterized_test_list(marker, test_dir_list, batch_size_list, platform_list): + test_cases = [ + ( + f"{marker}_{param1}_batchSize-{param2}_platform-{param3}", + { + "test_dir": param1, + "batch_size": param2, + "platform": param3, + }, + ) + for param1, param2, param3 in itertools.product( + test_dir_list, + batch_size_list, + platform_list, + ) + ] + return test_cases + + +def pytest_generate_tests(metafunc): + idlist = [] + argvalues = [] + scenarios = [] + + # Separate the full list of markers used on command line. 
+ # This allows a user to select multiple markers + all_markers_used = metafunc.config.getoption("-m").split(" ") + current_dir = os.getcwd() + test_dirs = [ + name for name in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, name)) + ] + test_dirs = remove_cache_dirs(test_dirs) + + for marker in all_markers_used: + if "Pynq" in marker or "U250" in marker or "ZCU104" in marker or "KV260_SOM" in marker: + platform = get_platform(marker) + scenarios.extend( + get_full_parameterized_test_list( + marker, test_dir_list=test_dirs, batch_size_list=[1], platform_list=[platform] + ) + ) + + if len(scenarios) > 0: + for scenario in scenarios: + idlist.append(scenario[0]) + items = scenario[1].items() + argnames = [x[0] for x in items] + argvalues.append([x[1] for x in items]) + metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") + + +@pytest.mark.Pynq +@pytest.mark.U250 +@pytest.mark.ZCU104 +@pytest.mark.KV260_SOM +class TestBnn: + def test_type_execute(self, test_dir, batch_size, platform): + # Enter into test directory and clean any files from a potential previous run + os.chdir(os.path.join(base_dir_global, test_dir)) + delete_file(output_execute_results_file) + + # Run test option: execute + bitfile = "a.xclbin" if platform == "alveo" else "resizer.bit" + result = subprocess.run( + [ + "python", + "driver.py", + "--exec_mode=execute", + f"--batchsize={batch_size}", + f"--bitfile={bitfile}", + "--inputfile=input.npy", + "--outputfile=output.npy", + f"--platform={platform}", + ], + capture_output=True, + text=True, + timeout=default_test_run_timeout, + ) + assert result.returncode == 0 + + # Load the output and reference arrays + output_array = np.load(output_execute_results_file) + reference_array = np.load(execute_results_reference_file) + + # Compare the arrays + try: + assert np.isclose(output_array, reference_array).all() + except AssertionError as e: + logger.error("AssertionError occurred: %s", e, exc_info=True) + raise + + def test_type_throughput(self, test_dir, batch_size, platform): + os.chdir(os.path.join(base_dir_global, test_dir)) + delete_file(output_throughput_results_file) + + # Run test option: throughput + bitfile = "a.xclbin" if platform == "alveo" else "resizer.bit" + result = subprocess.run( + [ + "python", + "driver.py", + "--exec_mode=throughput_test", + f"--batchsize={batch_size}", + f"--bitfile={bitfile}", + "--inputfile=input.npy", + "--outputfile=output.npy", + f"--platform={platform}", + ], + capture_output=True, + text=True, + timeout=default_test_run_timeout, + ) + assert result.returncode == 0 + + # Check if nw_metrics.txt now exists after test run + assert os.path.exists(output_throughput_results_file) + + with open(output_throughput_results_file, "r") as file: + res = eval(file.read()) + + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [8**i for i in range(5)] + bsize_range = [] + ret = dict() + for bsize in bsize_range_in: + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) + else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "%s Throughput Test Results" % test_dir + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += 
"\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[MB/s]"], 2), + np.round(v["DRAM_out_bandwidth[MB/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + # largest_bsize = bsize_range[-1] + + # Dump the metrics to a text file + with open(throughput_results_formatted_file, "w") as f: + f.write(ret_str) + assert os.path.exists(throughput_results_formatted_file) diff --git a/docker/quicktest.sh b/docker/quicktest.sh index f625f2b1ef..3684e3a0d4 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -2,20 +2,20 @@ : ${PYTEST_PARALLEL=auto} -cd $FINN_ROOT/finn +cd $FINN_ROOT # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow or board) with pytest-xdist" - python setup.py test --addopts "-m 'not (vivado or slow or vitis or board)' --dist=loadfile -n $PYTEST_PARALLEL" + pytest -m 'not (vivado or slow or vitis or board or notebooks or bnn_pynq)' --dist=loadfile -n $PYTEST_PARALLEL elif [ $1 = "main" ]; then echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist" - python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL" + pytest -k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL elif [ $1 = "rtlsim" ]; then echo "Running rtlsim test suite with pytest-parallel" - python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL" + pytest -k rtlsim --workers $PYTEST_PARALLEL elif [ $1 = "end2end" ]; then echo "Running end2end test suite with no parallelism" - python setup.py test --addopts "-k end2end" + pytest -k end2end elif [ $1 = "full" ]; then echo "Running full test suite, each step with appropriate parallelism" $0 main; diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 304aa30854..950b601f98 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -16,6 +16,6 @@ Two of the Brevitas-exported ONNX variants can be ingested by FINN: To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN. -At this stage we can already use the functional verification flow to simulate the model using Python, this is marked in the graphic with the dotted arrow. For more details please have look at :ref:`verification`. +At this stage we can already use the functional verification flow to simulate the model using Python. For more details please have look at :ref:`verification`. The model can now be further processed in FINN, the next flow step is :ref:`nw_prep`. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index 12e01db554..8c37479a28 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -105,7 +105,7 @@ The following outputs will be generated regardless of which particular outputs a The other output products are controlled by the `generate_outputs` field in the build configuration), and are detailed below. -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. 
This can be useful for setting up the parallelization and other hardware configuration: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration: * ``report/estimate_layer_cycles.json`` -- cycles per layer estimation from analytical model * ``report/estimate_layer_resources.json`` -- resources per layer estimation from analytical model @@ -113,31 +113,31 @@ build configuration), and are detailed below. * ``report/estimate_network_performance.json`` -- whole-network performance estimation from analytical model * ``report/op_and_param_counts.json`` -- per-layer and total number of operations and parameters (independent of parallelization) -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI: * ``stitched_ip/finn_vivado_stitch_proj.xpr`` -- Vivado project (including Vivado IP Integrator block design) to generate the stitched IP * ``stitched_ip/ip`` -- exported Vivado IP for the stitched design -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator * ``report/rtlsim_performance.json`` -- accelerator throughput and latency from RTL simulation -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. 
This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines: * ``report/ooc_synth_and_timing.json`` -- resources and achievable clock frequency from out-of-context synthesis -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory: * ``bitfile/finn-accel.(bit|xclbin)`` -- generated bitfile depending on platform * ``report/post_synth_resources.xml`` -- FPGA resource utilization after synthesis * ``report/post_route_timing.rpt`` -- post-route timing report -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator: * ``driver/driver.py`` -- Python driver that can be used on PYNQ on Zynq or Alveo platforms to launch the accelerator -* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.DEPLOYMENT_PACKAGE`: +* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.DEPLOYMENT_PACKAGE`: * ``deploy/`` -- deployment package folder with a bitfile and driver, ready to be copied to target hardware platform @@ -153,7 +153,7 @@ and compare it against the expected output that you provide. This is achieved by setting up the following members of the build configuration: -* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow.VerificationStepType` +* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow_config.VerificationStepType` where each element in the list indicates the output of a particular step that will be verified. See the documentation of the ``VerificationStepType`` for more information. diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index b152dfef66..1e1c48e2b5 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -12,7 +12,7 @@ Prerequisites Before starting to do development on FINN it's a good idea to start with understanding the basics as a user. Going through all of the -:ref:`tutorials` is strongly recommended if you haven' already done so. +:ref:`tutorials` is strongly recommended if you haven't already done so. Additionally, please review the documentation available on :ref:`internals`. Repository structure @@ -153,14 +153,14 @@ from the FINN root directory as follows: :: - python setup.py test --addopts "-k test_brevitas_debug --pdb" + pytest -k test_brevitas_debug --pdb If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU) you can use: -* pytest-parallel for any rtlsim tests, e.g. `python setup.py test --addopts "-k rtlsim --workers auto"` -* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `python setup.py test --addopts "-k mytest -n auto --dist=loadfile"` +* pytest-parallel for any rtlsim tests, e.g. 
`pytest -k rtlsim --workers auto` +* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `pytest -k mytest -n auto --dist=loadfile` Finally, the full test suite with appropriate parallelization can be run inside the container by: diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index bc5c523071..0a022067c3 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -9,7 +9,7 @@ As you can see in the picture, FINN has a high modularity and has the property t :scale: 50% :align: center -The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS and Vivado IPI (orange section). There is also a section for testing and verification in software (red section) and the hardware generation and deployment on the PYNQ board (yellow section). +The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. The flow starts in top left corner with Brevitas export, followed by the preparation of the network for the Vitis HLS and Vivado IPI. There is also a section for testing and verification in software (in the cloud on the right) and the hardware generation and deployment on the PYNQ board. This example flow is covered in the `end2end_example `_ Jupyter notebooks. For a more detailed overview about the different flow sections, please have a look at the corresponding pages: diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 40425c119f..c575ca7e3b 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -20,7 +20,7 @@ How do I use FINN? ================== We strongly recommend that you first watch one of the pre-recorded `FINN tutorial `_ -videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection `_ . +videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection `_ . You may also want to check out the other :ref:`tutorials`, and the `FINN examples repository `_ . Our aim in FINN is *not* to accelerate common off-the-shelf neural networks, but instead provide you with a set of tools @@ -28,19 +28,19 @@ to train *customized* networks and create highly-efficient FPGA implementations In general, the approach for using the FINN framework is as follows: 1. Train your own quantized neural network (QNN) in `Brevitas `_. We have some `guidelines `_ on quantization-aware training (QAT). -2. Export to FINN-ONNX by following `this tutorial `_ . -3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ +2. Export to FINN-ONNX by following `this tutorial `_ . +3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ 4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results. 
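As a sketch of step 3 above, the ``build_dataflow`` system can be driven from a small Python script. The field values below (model filename, board, clock, fps target) are illustrative assumptions; the output types referenced are the ones documented in ``command_line.rst`` elsewhere in this patch:

```python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

cfg = build_cfg.DataflowBuildConfig(
    output_dir="output_mlp_pynqz1",  # reports, driver and bitfile land here
    synth_clk_period_ns=10.0,        # 100 MHz target clock
    target_fps=100000,               # drives folding/parallelization
    board="Pynq-Z1",
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ],
)
build.build_dataflow_cfg("model.onnx", cfg)
```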
Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide. If there are substantial differences, you will most likely have to write your own Python scripts that call the appropriate FINN compiler functions that process your design correctly, or adding new functions (including -Vivado HLS layers) +Vitis HLS layers) as required. -The `advanced FINN tutorials `_ can be useful here. +The `advanced FINN tutorials `_ can be useful here. For custom networks, we recommend making a copy of the `BNN-PYNQ end-to-end -Jupyter notebook tutorials `_ as a starting point, visualizing the model at intermediate +Jupyter notebook tutorials `_ as a starting point, visualizing the model at intermediate steps and adding calls to new transformations as needed. Once you have a working flow, you can implement a command line entry for this by using the "advanced mode" described in the :ref:`command_line` section. @@ -50,7 +50,8 @@ Running FINN in Docker FINN runs inside a Docker container, it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started. You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well. If you want to use prebuilt images, read :ref:`Using a prebuilt image`. -The ``run-docker.sh`` script that can be launched in the following modes: + +The above-mentioned script to build and launch the FINN Docker container is called `run-docker.sh `_ . It can be launched in the following modes: Launch interactive shell ************************ @@ -106,9 +107,6 @@ These are summarized below: * (optional) ``LOCALHOST_URL`` (default localhost) sets the base URL for accessing e.g. Netron from inside the container. Useful when running FINN remotely. * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite -* (optional) ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target -* (optional) ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication. -* (optional) ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests. * (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``. * (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use. @@ -140,10 +138,7 @@ If you are having trouble building the Docker image or need offline access, you Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. 
We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. - -.. warning:: - In previous FINN versions (v0.4b - v0.7) we had support for `Xilinx Alveo boards `_ using PYNQ and Vitis 2020.1, see instructions below for Alveo setup that works with older versions. Please note that with the new release with Vitis 2022.1, we do only have experimental support to automatically deployment for Alveo cards. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. @@ -181,12 +176,12 @@ On the target side: On the host side: -1. Install Vitis 2020.1 and set up the ``VITIS_PATH`` environment variable to point to your installation. +1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation. 2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed. 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)* 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above. 5. `Set up public key authentication `_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution. -6. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time. +6. Done! Vivado/Vitis license ********************* @@ -214,7 +209,7 @@ We also recommend running the FINN compiler on a system with sufficiently strong hardware: * **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be - able to run Vivado/Vitis synthesis for that part. See `this page `_ + able to run Vivado/Vitis synthesis for that part. See `this page `_ for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB. For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended. diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index 2a64b87943..a5c486935d 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -9,14 +9,14 @@ Hardware Build and Deployment :align: center A model where all layers have been converted to HLS layers can be processed by -FINN to build a bitfile and driver targeting a Zynq system or to generate a Vivado IP Integrator (IPI) +FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI) design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. 
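Both paths are driven by the same build-configuration mechanism described in the :ref:`command_line` section. As a hedged illustration (the enum and field names come from :py:mod:`finn.builder.build_dataflow_config`; the board, FPGA part, and clock period below are placeholders, not recommendations), selecting between them might look like this::

    from finn.builder.build_dataflow_config import (
        DataflowBuildConfig,
        DataflowOutputType,
        ShellFlowType,
    )

    # Path 1: full bitfile plus PYNQ driver, accelerator wrapped in a Zynq shell
    zynq_cfg = DataflowBuildConfig(
        output_dir="build_zynq",
        synth_clk_period_ns=10.0,
        board="Pynq-Z2",
        shell_flow_type=ShellFlowType.VIVADO_ZYNQ,
        generate_outputs=[
            DataflowOutputType.BITFILE,
            DataflowOutputType.PYNQ_DRIVER,
            DataflowOutputType.DEPLOYMENT_PACKAGE,
        ],
    )

    # Path 2: stitched-IP design only, for integration into a larger Vivado system
    stitched_ip_cfg = DataflowBuildConfig(
        output_dir="build_stitched_ip",
        synth_clk_period_ns=10.0,
        fpga_part="xc7z020clg400-1",
        generate_outputs=[DataflowOutputType.STITCHED_IP],
    )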
Hardware Build ============== -Internally, the hardware build for Zynq devices consists of the following steps: +Internally, the hardware build consists of the following steps: 1. Driver generation 2. DMA and DWC node insertion @@ -89,9 +89,4 @@ Deployment ========== Deployment and Remote Execution ------------------------------- -The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there using the *onnx_exec* function with the right *exec_mode* settings. For details please have a look at transformation :py:mod:`finn.transformation.fpgadataflow.make_deployment.DeployToPYNQ` and the execution function :py:mod:`finn.core.onnx_exec`. - -Throughput Test --------------- - -FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done by using :py:mod:`finn.core.throughput_test`. When running this function the metrics of the network are returned as dictionary. +The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information, see the description in the `end2end_example `_ Jupyter notebooks. diff --git a/docs/finn/img/rtl_swg_impl_styles.png b/docs/finn/img/rtl_swg_impl_styles.png new file mode 100644 index 0000000000..265ff9b915 Binary files /dev/null and b/docs/finn/img/rtl_swg_impl_styles.png differ diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index 0b33affc76..652c94ac24 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -7,7 +7,7 @@ Internals Intermediate Representation: QONNX and FINN-ONNX ================================================ -FINN uses `ONNX `_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API `_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description `_ (or its `human-readable documentation `_ and the `operator schemas `_ are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN, see chapter :ref:`tutorials` for details. +FINN uses `ONNX `_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API `_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description `_ (or its `human-readable documentation `_) and the `operator schemas `_ are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN; see chapter :ref:`tutorials` for details. .. note:: FINN supports two specialized variants of ONNX called QONNX and FINN-ONNX, and not all ONNX graphs are supported by FINN (and vice versa). @@ -137,14 +137,14 @@ ModelWrapper contains more useful functions, if you are interested please have a Analysis Pass ============= -An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`. 
+An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass; see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` . .. _transformation_pass: Transformation Pass =================== -A transformation passes changes (transforms) the given model, it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additional the flag *model_was_changed* which indicates if a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass, see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation`. +A transformation pass changes (transforms) the given model: it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additionally, the flag *model_was_changed*, which indicates whether a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass; see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation` . .. _mem_mode: @@ -167,7 +167,7 @@ The following picture shows the idea behind the "const" and "decoupled" mode. Const mode ---------- -In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. +In *const* mode the weights are "baked into" the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as a *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of Verilog components are connected to these. Advantages: @@ -185,7 +185,7 @@ Disadvantages: Decoupled mode -------------- -In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. 
To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode. +In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a Verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting Verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode. Advantages: @@ -205,3 +205,142 @@ Disadvantages: How to set *mem_mode* --------------------- When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*. + + +.. _folding_factors: + +Constraints to folding factors per layer +========================================= + +.. 
list-table:: Folding factor constraints + + * - **Layers** + - **Parameters** + - **Constraints** + * - Addstreams_Batch + - PE + - inp_channels % PE == 0 + * - ChannelwiseOp_Batch + - PE + - channels % PE == 0 + * - ConvolutionInputGenerator + - SIMD + - inp_channels % SIMD == 0 + * - ConvolutionInputGenerator1d + - SIMD + - inp_channels % SIMD == 0 + * - Downsampler + - SIMD + - inp_channels % SIMD == 0 + * - DuplicateStreams_Batch + - PE + - channels % PE == 0 + * - Eltwise + - PE + - inp_channels % PE == 0 + * - FMPadding_batch + - SIMD + - inp_channels % SIMD == 0 + * - FMPadding_rtl + - SIMD + - inp_channels % SIMD == 0 + * - Globalaccpool_Batch + - PE + - channels % PE == 0 + * - Labelselect_Batch + - PE + - num_labels % PE == 0 + * - MatrixVectorActivation + - PE & SIMD + - MH % PE == 0 & MW % SIMD == 0 + * - Pool_Batch + - PE + - inp_channels % PE == 0 + * - Thresholding_Batch + - PE + - MH % PE == 0 + * - VectorVectorActivation + - PE & SIMD + - k_h * k_w % SIMD == 0 & channels % PE == 0 + + +RTL ConvolutionInputGenerator +============================= + +FINN implements convolution operations by pairing a ConvolutionInputGenerator (or "sliding window generator (SWG)") with an MVAU or VVAU (for depthwise convolution). +This RTL version is an alternative to the original `HLS implementation `_ and aims to improve on it in the following ways: + +* Support a wider range of hyperparameters without the fragmentation into 16+ separate HLS functions + +* Support additional degrees of parallelism (i.e., across the output window or multiple input samples) that are difficult to implement in HLS + +* Support additional features, such as dynamic feature map sizing + +* Improve resource efficiency + + +The component is implemented by generating (System-)Verilog code for each individual instance, realized via the template + replacement dictionary mechanism found in other FINN components. +Despite the HDL implementation, the component is managed by its own HLSCustomOp (!) named "ConvolutionInputGenerator_rtl". Naturally, HLS simulation & synthesis are not supported. + +The RTL SWG is currently disabled by default and can be enabled either in the corresponding HLS conversion transformation (:py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers.InferConvInpGen`) with `use_rtl_variant=True` or in the build configuration (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.force_rtl_conv_inp_gen` set to True). + +Implementation styles +--------------------- +Depending on the amount of parallelism requested, one of two implementation styles is selected. The following table defines folding parameters (marked in bold text) and supported configurations. + +.. list-table:: Parallelism configurations + + * - **SIMD** + - **parallel_window** + - **M** + - MMV_in + - MMV_out + - Style + - Notes + * - < C + - 0 + - 1 + - 1 + - 1 + - default + - depthwise-aware + * - C + - 0 + - 1 + - 1 + - 1 + - default + - depthwise-agnostic + * - C + - 1 + - 1 + - 1 + - K + - parallel + - depthwise-agnostic + * - C + - 1 + - M + - M + - M*K + - parallel + - Currently unsupported + +(With C = #Channels, MMV_in = input samples (or "pixels") per cycle, MMV_out = output samples (or "pixels") per cycle, K = kernel_width * kernel_height.) + +The following diagram shows the operating principle of both styles, the "parallel" variant is pictured for a 2x2 kernel without dilation. + +.. image:: img/rtl_swg_impl_styles.png + :align: center + +The main difference lies in the buffer structure. 
If the output width is equal to the input width ("default mode"), an addressable circular buffer is used, which can be implemented either in LUTRAM, BRAM, or URAM resources. If parallel access to multiple window elements is required ("parallel mode"), the SWG generates a fixed structure of registers and line buffers to avoid memory port limitations and exploding multiplexing logic, while still featuring LUT-saving BRAM/URAM implementation for the line buffers. + +The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. See `this pull request `_ description for more information. + +Folding +------- +The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications: + +**MVAU:** Although it is recommended to unfold SIMD first, SIMD and PE can be set independently. Full (and balanced) parallelism is achieved by using the SWG in parallel window mode and setting MVAU SIMD and PE to their maximum values (SIMD = MW = C_in * K, PE = MH = C_out). + +**VVAU:** While the VVAU HLS component supports SIMD unfolding independently from PE, the RTL SWG requires full unfolding across the channel dimension (SIMD of the SWG = PE of the VVAU) before enabling window-parallelism. Unlike the MVAU, the VVAU can't accept datawidth-converted input from a fully-parallel SWG in this case due to the depthwise data layout. As a result, the VVAU should be unfolded by PE first (up to PE = C), followed by SIMD (up to SIMD = K). diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst index 566eda5bac..6fea992cf7 100644 --- a/docs/finn/nw_prep.rst +++ b/docs/finn/nw_prep.rst @@ -10,7 +10,7 @@ Network Preparation The main principle of FINN are analysis and transformation passes. If you like to have more information about these please have a look at section :ref:`analysis_pass` and :ref:`transformation_pass` or at chapter :ref:`tutorials` about the provided Jupyter notebooks. -This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib `_ functions. In this way we get a network that we can bring to hardware with the help of Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`. +This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib `_ functions. In this way we get a network that we can bring to hardware with the help of Vitis and Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`. Various transformations are involved in the network preparation. The following is a short overview of these. 
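Before that overview, here is a hedged sketch of what a small preparation script can look like; it opts in to the RTL SWG during HLS conversion and then folds the MVAU layers by hand. The input file name and the PE/SIMD values are illustrative only, and folding values must satisfy the divisibility constraints tabulated earlier::

    from qonnx.core.modelwrapper import ModelWrapper
    from qonnx.custom_op.registry import getCustomOp
    import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

    model = ModelWrapper("model_streamlined.onnx")  # placeholder file name
    # opt in to the RTL sliding window generator during HLS conversion
    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))

    # fold every MVAU, respecting MH % PE == 0 and MW % SIMD == 0
    for node in model.graph.node:
        if node.op_type == "MatrixVectorActivation":
            inst = getCustomOp(node)
            mh, mw = inst.get_nodeattr("MH"), inst.get_nodeattr("MW")
            pe, simd = 4, 8  # example values; must divide MH and MW
            assert mh % pe == 0 and mw % simd == 0
            inst.set_nodeattr("PE", pe)
            inst.set_nodeattr("SIMD", simd)
    model.save("model_folded.onnx")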
diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst index b52e994ee6..57472cb670 100644 --- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst +++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst @@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.hls\_synth\_res\_estimation ------------------------------------------------------------- @@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation :undoc-members: :show-inheritance: - finn.analysis.fpgadataflow.op\_and\_param\_counts - -------------------------------------------------- +finn.analysis.fpgadataflow.op\_and\_param\_counts +-------------------------------------------------- - .. automodule:: finn.analysis.fpgadataflow.op_and_param_counts +.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts :members: :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.post\_synth\_res -------------------------------------------------- @@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.res\_estimation ------------------------------------------------- diff --git a/docs/finn/source_code/finn.builder.rst b/docs/finn/source_code/finn.builder.rst index 2433cab83d..caadf3f91f 100644 --- a/docs/finn/source_code/finn.builder.rst +++ b/docs/finn/source_code/finn.builder.rst @@ -9,9 +9,9 @@ finn.builder.build\_dataflow ---------------------------- .. automodule:: finn.builder.build_dataflow - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: finn.builder.build\_dataflow\_config ------------------------------------ @@ -26,6 +26,6 @@ finn.builder.build\_dataflow\_steps ------------------------------------ .. automodule:: finn.builder.build_dataflow_steps - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 4e3de458e1..28cb47eaf7 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -37,18 +37,19 @@ qonnx.core.modelwrapper :undoc-members: :show-inheritance: -finn.core.onnx\_exec +qonnx.core.onnx\_exec --------------------------- -.. automodule:: finn.core.onnx_exec +.. automodule:: qonnx.core.onnx_exec :members: :undoc-members: :show-inheritance: -finn.core.remote\_exec ------------------------------ -.. automodule:: finn.core.remote_exec +finn.core.onnx\_exec +--------------------------- + +.. automodule:: finn.core.onnx_exec :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index cc56ea603e..fdcf44c6d9 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -8,7 +8,7 @@ HLS Custom Op Nodes Base Class ---------- -.. automodule:: finn.custom_op.fpgadataflow +.. automodule:: finn.custom_op.fpgadataflow.hlscustomop :members: :undoc-members: :show-inheritance: @@ -29,9 +29,25 @@ finn.custom\_op.fpgadataflow.channelwise\_op\_batch :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.checksum +-------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.checksum + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.concat +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.concat + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.convolutioninputgenerator -------------------------------------------------------------- +-------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator :members: @@ -46,6 +62,15 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator1d :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl +------------------------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.downsampler ----------------------------------------- @@ -62,6 +87,16 @@ finn.custom\_op.fpgadataflow.duplicatestreams\_batch :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.eltwise +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.eltwise + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.fmpadding\_batch ----------------------------------------------- @@ -79,7 +114,7 @@ finn.custom\_op.fpgadataflow.globalaccpool\_batch :show-inheritance: finn.custom\_op.fpgadataflow.iodma ------------------------------------------------ +------------------------------------ .. automodule:: finn.custom_op.fpgadataflow.iodma :members: @@ -102,6 +137,15 @@ finn.custom\_op.fpgadataflow.lookup :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.matrixvectoractivation +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.pool\_batch ----------------------------------------------- @@ -127,14 +171,6 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.matrixvectoractivation ------------------------------------------------------------ - -.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation - :members: - :undoc-members: - :show-inheritance: - finn.custom\_op.fpgadataflow.streamingfifo ------------------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst index 20d90a7bb5..cdbe957c71 100644 --- a/docs/finn/source_code/finn.custom_op.rst +++ b/docs/finn/source_code/finn.custom_op.rst @@ -9,6 +9,7 @@ Submodules :maxdepth: 2 finn.custom_op.fpgadataflow + qonnx.custom_op.channels_last qonnx.custom_op.general Custom Op Nodes diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst index b1e7075bdc..f7137ae347 100644 --- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst +++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst @@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip :undoc-members: :show-inheritance: +finn.transformation.fpgadataflow.derive\_characteristic +------------------------------------------------------------ + +.. 
automodule:: finn.transformation.fpgadataflow.derive_characteristic + :members: + :undoc-members: + :show-inheritance: + finn.transformation.fpgadataflow.externalize\_params ------------------------------------------------------------ @@ -103,6 +111,17 @@ finn.transformation.fpgadataflow.insert\_fifo :undoc-members: :show-inheritance: + +finn.transformation.fpgadataflow.insert\_hook +---------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.insert_hook + :members: + :undoc-members: + :show-inheritance: + + + finn.transformation.fpgadataflow.insert\_iodma ---------------------------------------------------- @@ -154,6 +173,15 @@ finn.transformation.fpgadataflow.minimize\_accumulator\_width :show-inheritance: +finn.transformation.fpgadataflow.minimize\_weight\_bit\_width +-------------------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.minimize_weight_bit_width + :members: + :undoc-members: + :show-inheritance: + + finn.transformation.fpgadataflow.prepare\_cppsim ------------------------------------------------------- diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index 6a28eeedb2..f42b595a50 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -20,7 +20,7 @@ Transformation Passes Base Class ---------- -.. automodule:: finn.transformation +.. automodule:: qonnx.transformation.base :members: :undoc-members: :show-inheritance: @@ -42,7 +42,7 @@ qonnx.transformation.bipolar\_to\_xnor :show-inheritance: qonnx.transformation.change\_3d\_tensors\_to\_4d ------------------------------------------------- +------------------------------------------------- .. automodule:: qonnx.transformation.change_3d_tensors_to_4d :members: @@ -57,8 +57,18 @@ qonnx.transformation.change\_datalayout :undoc-members: :show-inheritance: + +qonnx.transformation.channels\_last +-------------------------------------------- + +.. automodule:: qonnx.transformation.channels_last + :members: + :undoc-members: + :show-inheritance: + + qonnx.transformation.create\_generic\_partitions ------------------------------------------------- +------------------------------------------------- .. automodule:: qonnx.transformation.create_generic_partitions :members: @@ -171,13 +181,22 @@ qonnx.transformation.merge\_onnx\_models :show-inheritance: -finn.transformation.move\_reshape +qonnx.transformation.quant\_constant\_folding +---------------------------------------------- + +.. automodule:: qonnx.transformation.quant_constant_folding + :members: + :undoc-members: + :show-inheritance: + + +qonnx.transformation.rebalance\_conv ---------------------------------------- -.. automodule:: finn.transformation.move_reshape - :members: - :undoc-members: - :show-inheritance: +.. automodule:: qonnx.transformation.rebalance_conv + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.remove ------------------------------------- @@ -186,3 +205,12 @@ qonnx.transformation.remove :members: :undoc-members: :show-inheritance: + + +finn.transformation.move\_reshape +---------------------------------------- + +.. 
automodule:: finn.transformation.move_reshape + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 8dffa01632..aebd0604f4 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -14,6 +14,15 @@ qonnx.util.basic :show-inheritance: +qonnx.util.cleanup +---------------------- + +.. automodule:: qonnx.util.cleanup + :members: + :undoc-members: + :show-inheritance: + + qonnx.util.config -------------------- @@ -22,6 +31,40 @@ qonnx.util.config :undoc-members: :show-inheritance: +qonnx.util.exec\_qonnx +---------------------- + +.. automodule:: qonnx.util.exec_qonnx + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.inference\_cost +-------------------------- + +.. automodule:: qonnx.util.inference_cost + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.onnx +------------------- + +.. automodule:: qonnx.util.onnx + :members: + :undoc-members: + :show-inheritance: + + +qonnx.util.to\_channels\_last +------------------------------ + +.. automodule:: qonnx.util.to_channels_last + :members: + :undoc-members: + :show-inheritance: + + finn.util.basic ---------------------- @@ -56,14 +99,15 @@ finn.util.fpgadataflow :undoc-members: :show-inheritance: -finn.util.gdrive ------------------------------ +finn.util.hls +--------------- -.. automodule:: finn.util.gdrive +.. automodule:: finn.util.hls :members: :undoc-members: :show-inheritance: + finn.util.imagenet ----------------------------- @@ -72,14 +116,6 @@ finn.util.imagenet :undoc-members: :show-inheritance: -qonnx.util.onnx ---------------------- - -.. automodule:: qonnx.util.onnx - :members: - :undoc-members: - :show-inheritance: - finn.util.platforms -------------------- diff --git a/docs/finn/source_code/modules.rst b/docs/finn/source_code/modules.rst deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/finn/source_code/qonnx.custom_op.channels_last.rst b/docs/finn/source_code/qonnx.custom_op.channels_last.rst new file mode 100644 index 0000000000..3ad10d94a6 --- /dev/null +++ b/docs/finn/source_code/qonnx.custom_op.channels_last.rst @@ -0,0 +1,41 @@ +************************** +Custom Op - Channels Last +************************** + +Channels Last Custom Ops +========================= + +qonnx.custom\_op.channels\_last.base\_wrapped\_op +-------------------------------------------------- + +.. automodule:: qonnx.custom_op.channels_last.base_wrapped_op + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.batch\_normalization +------------------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.batch_normalization + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.conv +-------------------------------------- + +.. automodule:: qonnx.custom_op.channels_last.conv + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.max\_pool +------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.max_pool + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 110f77c5b1..7ac54501cf 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -46,3 +46,8 @@ The notebooks in this folder are more developer oriented. They should help you t * 2_custom_op * Explains the basics of FINN custom ops and how to define a new one. 
+ +FINN Example FPGA Flow Using MNIST Numerals +============================================ + +Next to the Jupyter notebooks above there is a tutorial about the command-line build_dataflow `here `_ which shows how to bring a FINN compiled model into the Vivado FPGA design environment. diff --git a/docs/img/finn-examples-header.png b/docs/img/finn-examples-header.png deleted file mode 100644 index 50f8fa7761..0000000000 Binary files a/docs/img/finn-examples-header.png and /dev/null differ diff --git a/docs/img/imagenet.jpg b/docs/img/imagenet.jpg deleted file mode 100644 index 5cdd5aa303..0000000000 Binary files a/docs/img/imagenet.jpg and /dev/null differ diff --git a/fetch-repos.sh b/fetch-repos.sh index 2dd5e51934..5b07d11273 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,15 +27,16 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c" -FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" -BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" +QONNX_COMMIT="04e24583fb5c1895744801480db3ced8a5b6a914" +FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c" +BREVITAS_COMMIT="9bb26bf2798de210a267d1e4aed4c20087e0e8a5" +PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="e9946e5e56acd85837e8e79224d2bb60764bed69" -OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" +HLSLIB_COMMIT="c17aa478ae574971d115afa9fa4d9c215857d1ac" +OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" +KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79" EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232" QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" @@ -47,6 +48,7 @@ HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git" OMX_URL="https://github.com/maltanar/oh-my-xilinx.git" AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" +KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" QONNX_DIR="qonnx" FINN_EXP_DIR="finn-experimental" @@ -57,6 +59,7 @@ HLSLIB_DIR="finn-hlslib" OMX_DIR="oh-my-xilinx" AVNET_BDF_DIR="avnet-bdf" XIL_BDF_DIR="xil-bdf" +KV260_SOM_BDF_DIR="kv260-som-bdf" # absolute path to this script, e.g. /home/user/bin/foo.sh SCRIPT=$(readlink -f "$0") @@ -104,6 +107,7 @@ fetch_board_files() { unzip -q pynq-z2.zip cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/ cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/; + cp -r $SCRIPTPATH/deps/$KV260_SOM_BDF_DIR/boards/Xilinx/kv260_som $SCRIPTPATH/deps/board_files/; cd $OLD_PWD } @@ -116,6 +120,7 @@ fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR +fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR # download extra Pynq board files and extract if needed if [ ! 
-d "$SCRIPTPATH/deps/board_files" ]; then diff --git a/finn-rtllib/axi_info/component.xml b/finn-rtllib/axi_info/component.xml index d22637534f..c7632e2915 100644 --- a/finn-rtllib/axi_info/component.xml +++ b/finn-rtllib/axi_info/component.xml @@ -197,6 +197,10 @@ ASSOCIATED_BUSIF s_axi + + FREQ_TOLERANCE_HZ + -1 + @@ -228,7 +232,7 @@ viewChecksum - 7d682dfc + c9da9874 @@ -244,7 +248,7 @@ viewChecksum - 7d682dfc + c9da9874 @@ -258,7 +262,7 @@ viewChecksum - e11f9727 + 1e654f67 @@ -607,7 +611,7 @@ hdl/axi_info_top.sv systemVerilogSource - CHECKSUM_ec9ff0da + CHECKSUM_db6ccc10 @@ -692,17 +696,22 @@ axi_info_top_v1_0 package_project - 5 - 2022-05-30T14:16:13Z + 6 + 2023-05-24T06:36:33Z - 2022.1 - + 2022.2 + - + + + + + + diff --git a/finn-rtllib/axi_info/hdl/axi_info_top.sv b/finn-rtllib/axi_info/hdl/axi_info_top.sv index ab2cfc8bed..74aebe3ec7 100644 --- a/finn-rtllib/axi_info/hdl/axi_info_top.sv +++ b/finn-rtllib/axi_info/hdl/axi_info_top.sv @@ -38,7 +38,10 @@ module axi_info_top #( bit [31:0] CHECKSUM_COUNT )( //- Global Control ------------------ + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axi, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input logic ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input logic ap_rst_n, //- AXI Lite ------------------------ diff --git a/finn-rtllib/fmpadding/hdl/axi2we.sv b/finn-rtllib/fmpadding/hdl/axi2we.sv new file mode 100644 index 0000000000..842ba3632c --- /dev/null +++ b/finn-rtllib/fmpadding/hdl/axi2we.sv @@ -0,0 +1,122 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief AXI-Light adapter for trivial write enable interface. + * @author Thomas B. 
Preußer + *****************************************************************************/ + +module axi2we #( + int unsigned ADDR_BITS +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [ADDR_BITS-1:0] s_axilite_AWADDR, + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading tied to all-ones + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [ADDR_BITS-1:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + // Write Enable Interface + output logic we, + output logic [ADDR_BITS-1:0] wa, + output logic [ 31:0] wd +); + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + + logic WABusy = 0; + logic WDBusy = 0; + logic [ADDR_BITS-1:0] Addr = 'x; + logic [ 31:0] Data = 'x; + + assign we = WABusy && WDBusy && s_axilite_BREADY; + assign wa = Addr; + assign wd = Data; + + uwire clr_wr = rst || we; + always_ff @(posedge clk) begin + if(clr_wr) begin + WABusy <= 0; + Addr <= 'x; + WDBusy <= 0; + Data <= 'x; + end + else begin + if(!WABusy) begin + WABusy <= s_axilite_AWVALID; + Addr <= s_axilite_AWADDR; + end + if(!WDBusy) begin + WDBusy <= s_axilite_WVALID; + Data <= s_axilite_WDATA; + end + end + end + assign s_axilite_AWREADY = !WABusy; + assign s_axilite_WREADY = !WDBusy; + assign s_axilite_BVALID = WABusy && WDBusy; + assign s_axilite_BRESP = '0; // OK + + // Answer all reads with '1 + logic RValid = 0; + uwire clr_rd = rst || (RValid && s_axilite_RREADY); + always_ff @(posedge clk) begin + if(clr_rd) RValid <= 0; + else if(!RValid) RValid <= s_axilite_ARVALID; + end + assign s_axilite_ARREADY = !RValid; + assign s_axilite_RVALID = RValid; + assign s_axilite_RDATA = '1; + assign s_axilite_RRESP = '0; // OK + +endmodule : axi2we diff --git a/finn-rtllib/fmpadding/hdl/fmpadding.sv b/finn-rtllib/fmpadding/hdl/fmpadding.sv new file mode 100644 index 0000000000..904c7c381f --- /dev/null +++ b/finn-rtllib/fmpadding/hdl/fmpadding.sv @@ -0,0 +1,224 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Feature map padding. + * @author Thomas B. Preußer + *****************************************************************************/ + +module fmpadding #( + int unsigned XCOUNTER_BITS, + int unsigned YCOUNTER_BITS, + int unsigned NUM_CHANNELS, + int unsigned SIMD, + int unsigned ELEM_BITS, + int unsigned INIT_XON, + int unsigned INIT_XOFF, + int unsigned INIT_XEND, + int unsigned INIT_YON, + int unsigned INIT_YOFF, + int unsigned INIT_YEND, + + localparam int unsigned STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + // Parameter Configuration ---------- + input logic we, + input logic [ 4:0] wa, + input logic [31:0] wd, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [STREAM_BITS-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [STREAM_BITS-1:0] m_axis_tdata +); + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + //----------------------------------------------------------------------- + // Parameter Sanity Checking + initial begin + automatic bit fail = 0; + + if(XCOUNTER_BITS < $clog2(1+INIT_XEND)) begin + $error("XCounter size too small to accommodate end count."); + fail = 1; + end + if(XCOUNTER_BITS < $clog2(1+INIT_XON)) begin + $error("XCounter size too small to accommodate ON count."); + fail = 1; + end + if(XCOUNTER_BITS < $clog2(1+INIT_XOFF)) begin + $error("XCounter size too small to accommodate OFF count."); + fail = 1; + end + if(YCOUNTER_BITS < $clog2(1+INIT_YEND)) begin + $error("YCounter size too small to accommodate end count."); + fail = 1; + end + if(YCOUNTER_BITS < $clog2(1+INIT_YON)) begin + $error("YCounter size too small to accommodate ON count."); + fail = 1; + end + if(YCOUNTER_BITS < $clog2(1+INIT_YOFF)) begin + $error("YCounter size too small to accommodate OFF count."); + fail = 1; + end + + if((INIT_XEND < INIT_XON) || (INIT_XOFF <= INIT_XON)) begin + $warning("Initial empty X output range."); + end + if((INIT_YEND < INIT_YON) || (INIT_YOFF <= INIT_YON)) begin + $warning("Initial empty Y output range."); + end + + if(fail) $finish(); + end + + //----------------------------------------------------------------------- + // Dynamically configurable state + typedef logic [XCOUNTER_BITS-1:0] xcount_t; + xcount_t XEnd = INIT_XEND; + xcount_t XOn = INIT_XON; + xcount_t XOff = INIT_XOFF; + + typedef logic [YCOUNTER_BITS-1:0] ycount_t; + ycount_t YEnd = INIT_YEND; + ycount_t YOn = INIT_YON; + ycount_t YOff = INIT_YOFF; + + always_ff @(posedge clk) begin + if(we) begin + unique case(wa) + 0*4: XOn <= wd; + 1*4: XOff <= wd; + 2*4: XEnd <= wd; + 3*4: YOn <= wd; + 4*4: YOff <= wd; + 5*4: YEnd <= wd; + + default: assert(0) else begin + $error("Illegal write address."); + $stop; + end + endcase + end + end + + //----------------------------------------------------------------------- + // 
Cascaded enables for the nested counters: SCount, XCount, YCount + uwire sen; + uwire xen; + uwire yen; + + //- S-Counter: SIMD fold ------------ + initial begin + if((NUM_CHANNELS < 1) || (NUM_CHANNELS % SIMD != 0)) begin + $error("Channel count must be SIMD multiple."); + $finish; + end + end + // Count SF-2, SF-3, ..., 1, 0, -1 + localparam int unsigned SF = NUM_CHANNELS/SIMD; + typedef logic [$clog2(SF-1):0] scount_t; + scount_t SCount = SF-2; + + assign xen = sen && SCount[$left(SCount)]; + uwire sclr = rst || xen; + always_ff @(posedge clk) begin + if(sclr) SCount <= SF-2; + else if(sen) SCount <= SCount - 1; + end + + //- X-Counter: image width ---------- + xcount_t XCount = 0; + + assign yen = xen && (XCount == XEnd); + uwire xclr = rst || yen; + always_ff @(posedge clk) begin + if(xclr) XCount <= 0; + else if(xen) XCount <= XCount + 1; + end + uwire xfwd = (XOn <= XCount) && (XCount < XOff); + + //- Y-Counter: image height --------- + ycount_t YCount = 0; + + uwire yclr = rst || (yen && (YCount == YEnd)); + always_ff @(posedge clk) begin + if(yclr) YCount <= 0; + else if(yen) YCount <= YCount + 1; + end + uwire yfwd = (YOn <= YCount) && (YCount < YOff); + + //----------------------------------------------------------------------- + // Input forwarding and edge padding + typedef struct { + logic vld; + logic [STREAM_BITS-1:0] dat; + } buf_t; + buf_t A = '{ vld: 0, dat: 'x }; + buf_t B = '{ vld: 0, dat: 'x }; + + uwire fwd = xfwd && yfwd; + assign sen = (m_axis_tready || !B.vld) && (s_axis_tvalid || A.vld || !fwd); + assign s_axis_tready = !A.vld; + assign m_axis_tvalid = B.vld; + assign m_axis_tdata = B.dat; + + always_ff @(posedge clk) begin + if(rst) begin + B <= '{ vld: 0, dat: 'x }; + end + else if(m_axis_tready || !B.vld) begin + B.vld <= s_axis_tvalid || A.vld || !fwd; + B.dat <= !fwd? '0 : A.vld? A.dat : s_axis_tdata; + end + end + + always_ff @(posedge clk) begin + if(rst) begin + A <= '{ vld: 0, dat: 'x }; + end + else begin + A.vld <= (A.vld || s_axis_tvalid) && ((B.vld && !m_axis_tready) || !fwd); + if(!A.vld) A.dat <= s_axis_tdata; + end + end + +endmodule : fmpadding diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv new file mode 100644 index 0000000000..5948341d00 --- /dev/null +++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv @@ -0,0 +1,123 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Feature map padding. + * @author Thomas B. Preußer + *****************************************************************************/ + +module fmpadding_axi #( + int unsigned XCOUNTER_BITS, + int unsigned YCOUNTER_BITS, + int unsigned NUM_CHANNELS, + int unsigned SIMD, + int unsigned ELEM_BITS, + int unsigned INIT_XON, + int unsigned INIT_XOFF, + int unsigned INIT_XEND, + int unsigned INIT_YON, + int unsigned INIT_YOFF, + int unsigned INIT_YEND, + + localparam int unsigned STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [4:0] s_axilite_AWADDR, + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [4:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [STREAM_BITS-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [STREAM_BITS-1:0] m_axis_tdata +); + + // AXI-Lite Adapter + uwire we; + uwire [ 4:0] wa; + uwire [31:0] wd; + axi2we #(.ADDR_BITS(5)) axilight_adapter ( + .ap_clk, .ap_rst_n, + + .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR, + .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB, + .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP, + + .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR, + .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP, + + .we, .wa, .wd + ); + + // Actual Padding + fmpadding #( + .XCOUNTER_BITS(XCOUNTER_BITS), .YCOUNTER_BITS(YCOUNTER_BITS), + .NUM_CHANNELS(NUM_CHANNELS), .SIMD(SIMD), + .INIT_XON(INIT_XON), .INIT_XOFF(INIT_XOFF), .INIT_XEND(INIT_XEND), + .INIT_YON(INIT_YON), .INIT_YOFF(INIT_YOFF), .INIT_YEND(INIT_YEND), + .ELEM_BITS(ELEM_BITS) + ) padding ( + .ap_clk, .ap_rst_n, + + .we, .wa, .wd, + + .s_axis_tready, .s_axis_tvalid, .s_axis_tdata, + .m_axis_tready, .m_axis_tvalid, .m_axis_tdata + ); + +endmodule : fmpadding_axi diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv new file mode 100644 index 0000000000..741689b3a7 --- /dev/null +++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv @@ -0,0 +1,154 @@ + +module fmpadding_axi_tb #( + int unsigned XCOUNTER_BITS = 8, + int unsigned YCOUNTER_BITS = 8, + int unsigned NUM_CHANNELS = 4, + int unsigned SIMD = 2, + int unsigned ELEM_BITS = 4 +)(); + localparam int unsigned STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8); + + //- Global 
Control ------------------
+	logic	clk = 0;
+	always #5ns clk = !clk;
+	logic	rst;
+
+	// AXI-Lite for Parameter Configuration
+	logic	s_axilite_AWVALID;
+	uwire	s_axilite_AWREADY;
+	logic	[2:0]	s_axilite_AWADDR;
+
+	logic	s_axilite_WVALID;
+	uwire	s_axilite_WREADY;
+	logic	[31:0]	s_axilite_WDATA;
+
+	//- AXI Stream - Input --------------
+	uwire	s_axis_tready;
+	logic	s_axis_tvalid;
+	logic	[STREAM_BITS-1:0]	s_axis_tdata;
+
+	//- AXI Stream - Output -------------
+	logic	m_axis_tready;
+	uwire	m_axis_tvalid;
+	uwire	[STREAM_BITS-1:0]	m_axis_tdata;
+
+
+	// DUT
+	fmpadding_axi #(
+		.XCOUNTER_BITS(XCOUNTER_BITS),
+		.YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS),
+		.SIMD(SIMD),
+		.INIT_XON(0), .INIT_XOFF(0), .INIT_XEND(0),
+		.INIT_YON(0), .INIT_YOFF(0), .INIT_YEND(0),
+		.ELEM_BITS(ELEM_BITS)
+	) dut (
+		.ap_clk(clk), .ap_rst_n(!rst),
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+		.s_axilite_BVALID(), .s_axilite_BREADY('1), .s_axilite_BRESP(),
+		.s_axilite_ARVALID('0), .s_axilite_ARREADY(), .s_axilite_ARADDR('x),
+		.s_axilite_RVALID(), .s_axilite_RREADY('0), .s_axilite_RDATA(), .s_axilite_RRESP(),
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+	// Stimuli
+	localparam int unsigned  IMAGES = 2;
+	localparam int unsigned  XSIZE = 10;
+	localparam int unsigned  YSIZE = 7;
+	localparam int unsigned  PAD_LEFT = 2;
+	localparam int unsigned  PAD_RIGHT = 3;
+	localparam int unsigned  PAD_TOP = 1;
+	localparam int unsigned  PAD_BOTTOM = 2;
+
+	task axi_write(input logic [2:0] wa, input logic [31:0] wd);
+		s_axilite_AWVALID <= 1;
+		s_axilite_AWADDR <= wa;
+		@(posedge clk iff s_axilite_AWREADY);
+		s_axilite_AWVALID <= 0;
+		s_axilite_AWADDR <= 'x;
+
+		s_axilite_WVALID <= 1;
+		s_axilite_WDATA <= wd;
+		@(posedge clk iff s_axilite_WREADY);
+		s_axilite_WVALID <= 0;
+		s_axilite_WDATA <= 'x;
+	endtask : axi_write
+
+
+	initial begin
+		s_axilite_AWVALID = 0;
+		s_axilite_AWADDR = 'x;
+		s_axilite_WVALID = 0;
+		s_axilite_WDATA = 'x;
+
+		s_axis_tvalid =  0;
+		s_axis_tdata  = 'x;
+
+		// Configure Parameters
+		rst = 0;
+		@(posedge clk);
+		/* XOn  */ axi_write(0, PAD_LEFT);
+		/* XOff */ axi_write(1, XSIZE - PAD_RIGHT);
+		/* XEnd */ axi_write(2, XSIZE - 1);
+		/* YOn  */ axi_write(4, PAD_TOP);
+		/* YOff */ axi_write(5, YSIZE - PAD_BOTTOM);
+		/* YEnd */ axi_write(6, YSIZE - 1);
+		@(posedge clk);
+		rst <= 1;
+		@(posedge clk);
+		rst <= 0;
+		@(posedge clk);
+
+		// Feed data input
+		s_axis_tvalid <= 1;
+		for(int unsigned  i = 0; i < IMAGES * (XSIZE-PAD_LEFT-PAD_RIGHT) * (YSIZE-PAD_TOP-PAD_BOTTOM) * (NUM_CHANNELS/SIMD); i++) begin
+			s_axis_tdata <= i;
+			@(posedge clk iff s_axis_tready);
+			if($urandom()%5 == 0) begin
+				s_axis_tvalid <= 0;
+				s_axis_tdata  <= 'x;
+				@(posedge clk);
+				s_axis_tvalid <= 1;
+			end
+		end
+		s_axis_tvalid <= 0;
+		s_axis_tdata  <= 'x;
+	end
+
+	// Output Throttler
+	initial begin
+		m_axis_tready =  0;
+		@(posedge clk iff !rst);
+		m_axis_tready <= 1;
+		forever @(posedge clk iff m_axis_tvalid) begin
+			m_axis_tready <= 0;
+			repeat(4-$clog2(1+$urandom()%15)) @(posedge clk);
+			m_axis_tready <= 1;
+		end
+	end
+
+	// Output logger
+	initial begin
+		@(negedge rst);
+		repeat(IMAGES) begin
+			for(int unsigned  y = 0; y < YSIZE; y++) begin
+				for(int unsigned  x = 0; x < XSIZE; x++) begin
+					automatic string  delim = " ";
+					for(int unsigned  s = 0; s < NUM_CHANNELS/SIMD; s++) begin
+						@(posedge clk iff m_axis_tvalid && m_axis_tready);
+						$write("%s%02X", delim,
m_axis_tdata); + delim = ":"; + end + end + $display(); + end + $display("----"); + end + $finish; + end + +endmodule : fmpadding_axi_tb diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v new file mode 100644 index 0000000000..0b0f40f86a --- /dev/null +++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v @@ -0,0 +1,118 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$(
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_rst_n,
+
+//- AXI Lite ------------------------
+// Writing
+input	s_axilite_AWVALID,
+output	s_axilite_AWREADY,
+input	[4:0]	s_axilite_AWADDR,
+
+input	s_axilite_WVALID,
+output	s_axilite_WREADY,
+input	[31:0]	s_axilite_WDATA,
+input	[ 3:0]	s_axilite_WSTRB,
+
+output	s_axilite_BVALID,
+input	s_axilite_BREADY,
+output	[1:0]	s_axilite_BRESP,
+
+// Reading
+input	s_axilite_ARVALID,
+output	s_axilite_ARREADY,
+input	[4:0]	s_axilite_ARADDR,
+
+output	s_axilite_RVALID,
+input	s_axilite_RREADY,
+output	[31:0]	s_axilite_RDATA,
+output	[ 1:0]	s_axilite_RRESP,
+
+//- AXI Stream - Input --------------
+output	in0_V_TREADY,
+input	in0_V_TVALID,
+input	[$STREAM_BITS$-1:0]	in0_V_TDATA,
+
+//- AXI Stream - Output -------------
+input	out_V_TREADY,
+output	out_V_TVALID,
+output	[$STREAM_BITS$-1:0]	out_V_TDATA
+);
+
+
+fmpadding_axi #(
+.XCOUNTER_BITS($XCOUNTER_BITS$),
+.YCOUNTER_BITS($YCOUNTER_BITS$),
+.NUM_CHANNELS($NUM_CHANNELS$),
+.SIMD($SIMD$),
+.ELEM_BITS($ELEM_BITS$),
+.INIT_XON($INIT_XON$),
+.INIT_XOFF($INIT_XOFF$),
+.INIT_XEND($INIT_XEND$),
+.INIT_YON($INIT_YON$),
+.INIT_YOFF($INIT_YOFF$),
+.INIT_YEND($INIT_YEND$)
+)
+$TOP_MODULE_NAME$_impl
+(
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .s_axilite_AWVALID(s_axilite_AWVALID),
+ .s_axilite_AWREADY(s_axilite_AWREADY),
+ .s_axilite_AWADDR(s_axilite_AWADDR),
+ .s_axilite_WVALID(s_axilite_WVALID),
+ .s_axilite_WREADY(s_axilite_WREADY),
+ .s_axilite_WDATA(s_axilite_WDATA),
+ .s_axilite_WSTRB(s_axilite_WSTRB),
+ .s_axilite_BVALID(s_axilite_BVALID),
+ .s_axilite_BREADY(s_axilite_BREADY),
+ .s_axilite_BRESP(s_axilite_BRESP),
+ .s_axilite_ARVALID(s_axilite_ARVALID),
+ .s_axilite_ARREADY(s_axilite_ARREADY),
+ .s_axilite_ARADDR(s_axilite_ARADDR),
+ .s_axilite_RVALID(s_axilite_RVALID),
+ .s_axilite_RREADY(s_axilite_RREADY),
+ .s_axilite_RDATA(s_axilite_RDATA),
+ .s_axilite_RRESP(s_axilite_RRESP),
+ .s_axis_tready(in0_V_TREADY),
+ .s_axis_tvalid(in0_V_TVALID),
+ .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY),
+ .m_axis_tvalid(out_V_TVALID),
+ .m_axis_tdata(out_V_TDATA)
+);
+
+endmodule
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 63a8540a76..722da1d803 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
[IP-XACT markup lost in extraction; recoverable changes in this diff:
 * vendor/library xilinx.com:user -> amd.com:finn; core memstream 1.0 repackaged with Vivado 2022.2 (was 2020.1) around the new SystemVerilog top memstream_axi_wrapper, with synthesis, simulation and a new implementation view (updated view checksums)
 * bus interfaces m_axis_1 .. m_axis_5 (TDATA/TVALID/TREADY) and all m_axis_*_afull ports removed, leaving a single m_axis_0 stream plus s_axilite
 * clock/reset ports renamed aclk/aresetn -> ap_clk/ap_rst_n (ACTIVE_LOW), with ASSOCIATED_BUSIF m_axis_0:s_axilite, ASSOCIATED_RESET ap_rst_n and FREQ_TOLERANCE_HZ -1
 * AXI-lite awaddr/araddr narrowed from [15:0] to [10:0]; memory map reg0 shrunk from 65536 to 4096 bytes at offset 0x0
 * parameters CONFIG_EN, NSTREAMS, MEM_DEPTH, MEM_WIDTH, MEM_INIT and the per-stream STRMx_WIDTH/STRMx_DEPTH/STRMx_OFFSET set replaced by DEPTH=512, WIDTH=32, INIT_FILE, RAM_STYLE=auto; AXILITE_ADDR_WIDTH default 16 -> 11
 * file groups now reference hdl/memstream.sv, hdl/memstream_axi.sv, hdl/memstream_axi_wrapper.v and hdl/axilite_if.v instead of memstream.v, memstream_multiblock.v, memstream_singleblock.v, mux.v, ramb18_sdp.v, ramb18_wf_dualport.v]
diff --git a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
index a68b85e1f5..271f9df453 100644
--- a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
+++ b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
@@ -1,2 +1,2 @@
 # This file is automatically written. Do not modify.
-proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {MEM_DEPTH MEM_WIDTH } {expr 2+ceil(log($MEM_DEPTH*pow(2,ceil(log(($MEM_WIDTH+31)/32)/log(2))))/log(2))}
+proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {DEPTH WIDTH } {expr 2 + ceil(log($DEPTH*pow(2, ceil(log(($WIDTH+31)/32)/log(2))))/log(2))}
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index b4e89628a4..11cef604e0 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -69,7 +69,7 @@
 `define Q_srl
 
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
 
 parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
 parameter width = 16;   // - width of data (i_d, o_d)
@@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    wire   o_b;   // - output stream back-pressure
 
    output [addrwidth:0] count;  // - output number of elems in queue
+   output [addrwidth:0] maxcount;  // - maximum observed count since reset
 
+   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
    reg [addrwidth-1:0] addr, addr_, a_;  // - SRL16 address
                                          //   for data output
    reg shift_en_;   // - SRL16 shift enable
@@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
   assign o_d = srlo;     // - output data from queue
   assign o_v = o_v_reg;  // - output valid if non-empty
   assign i_b = i_b_reg;  // - input bp if full
+  assign maxcount = maxcount_reg;
 
   assign i_r = !i_b;
   assign o_b = !o_r;
@@ -139,7 +142,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
	 addr      <= 0;
	 addr_full <= 0;
	 o_v_reg   <= 0;
-	 i_b_reg   <= 1;
+
+	 i_b_reg   <= 0;
+	 maxcount_reg <= 0;
+
      end
      else begin
	 state     <= state_;
@@ -147,6 +153,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
	 addr_full <= addr_full_;
	 o_v_reg   <= o_v_reg_;
	 i_b_reg   <= i_b_reg_;
+	 maxcount_reg <= (count > maxcount_reg ?
count : maxcount_reg); end end // always @ (posedge clock) diff --git a/finn-rtllib/memstream/hdl/memstream.sv b/finn-rtllib/memstream/hdl/memstream.sv new file mode 100644 index 0000000000..9cbef493a3 --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream.sv @@ -0,0 +1,176 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream #( + int unsigned DEPTH, + int unsigned WIDTH, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto" +)( + input logic clk, + input logic rst, + + // Configuration and readback interface - compatible with ap_memory + input logic config_ce, + input logic config_we, + input logic [31 :0] config_address, + input logic [WIDTH-1:0] config_d0, + + output logic config_rack, + output logic [WIDTH-1:0] config_q0, + + // Continuous output stream + input logic ordy, + output logic ovld, + output logic [WIDTH-1:0] odat +); + + typedef logic [$clog2(DEPTH)-1:0] addr_t; + typedef logic [WIDTH -1:0] data_t; + + uwire en; // Pipeline enable + uwire rollback; // Rollback stream reads if backpressure would block read back + + // Counter with pre-computed last indication for val == DEPTH-1 + typedef struct { + addr_t val; + logic lst; + } ptr_t; + + // Counter history to facilitate pipeline rollback + ptr_t Ptr[3] = '{ + 0: '{ val: 0, lst: DEPTH<2 }, + default: '{ default: 'x } + }; + + //----------------------------------------------------------------------- + // Stage #0: Address & Op + logic Wr1 = 0; // Write + logic Rb1 = 0; // Read back + logic Rs1 = 0; // Read stream + data_t Data1 = 'x; + if(1) begin : blkStage1 + // Increment for wrapping DEPTH-1 back to zero + localparam int unsigned WRAP_INC = 2**$bits(addr_t) - DEPTH + 1; + + uwire ptr_t ptr_eff = rollback? Ptr[2] : Ptr[0]; + uwire ptr_t ptr_nxt; + assign ptr_nxt.val = ptr_eff.val + (config_ce? 0 : !ptr_eff.lst? 1 : WRAP_INC); + assign ptr_nxt.lst = + DEPTH < 2? 1 : + config_ce? ptr_eff.lst : + ptr_eff.lst? 
0 : + /* else */ ptr_eff.val == DEPTH-2; + + always_ff @(posedge clk) begin + if(rst) Ptr[0] <= '{ val: 0, lst: DEPTH<2 }; + else if(en) Ptr[0] <= ptr_nxt; + end + + // Issue next Memory Operation + always_ff @(posedge clk) begin + if(rst) begin + Wr1 <= 0; + Rb1 <= 0; + Rs1 <= 0; + Ptr[1] <= '{ default : 'x }; + Data1 <= 'x; + end + else if(en) begin + Wr1 <= 0; + Rb1 <= 0; + Rs1 <= 0; + if(config_ce) begin + if(config_we) Wr1 <= 1; + else Rb1 <= 1; + Ptr[1] <= '{ val: config_address, lst: 'x }; + Data1 <= config_d0; + end + else begin + Rs1 <= 1; + Ptr[1] <= ptr_eff; + Data1 <= 'x; + end + end + end + end : blkStage1 + + //----------------------------------------------------------------------- + // Stage #2: Memory Access + logic Rb2 = 0; + logic Rs2 = 0; + data_t Data2 = 'x; + if(1) begin : blkStage2 + (* RAM_STYLE = RAM_STYLE *) + data_t Mem[DEPTH]; + + // Optional Memory Initialization + if(INIT_FILE != "") initial $readmemh(INIT_FILE, Mem); + + // Execute Memory Operation + uwire addr_t addr = Ptr[1].val; + always_ff @(posedge clk) begin + if(en) begin + if(Wr1) Mem[addr] <= Data1; + Data2 <= Mem[addr]; + end + end + + // Copy Output Designation + always_ff @(posedge clk) begin + if(rst) begin + Rb2 <= 0; + Rs2 <= 0; + Ptr[2] <= '{ default: 'x }; + end + else if(en) begin + Rb2 <= Rb1; + Rs2 <= Rs1 && !rollback; + Ptr[2] <= Ptr[1]; + end + end + end : blkStage2 + + //----------------------------------------------------------------------- + // Output Interfaces + assign config_rack = Rb2; + assign config_q0 = Data2; + + assign ovld = Rs2; + assign odat = Data2; + + uwire backpressure = Rs2 && !ordy; + assign rollback = backpressure && (Rb1 || config_ce); + assign en = !backpressure || Rb1 || config_ce; + +endmodule : memstream diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v deleted file mode 100644 index 2cd955f8d1..0000000000 --- a/finn-rtllib/memstream/hdl/memstream.v +++ /dev/null @@ -1,327 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module memstream -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths - parameter CONFIG_EN = 1, - parameter NSTREAMS = 6,//1 up to 6 - - parameter MEM_DEPTH = 13824, - parameter MEM_WIDTH = 32, - parameter MEM_INIT = "./", - parameter RAM_STYLE = "auto", - - //widths per stream - parameter STRM0_WIDTH = 32, - parameter STRM1_WIDTH = 32, - parameter STRM2_WIDTH = 32, - parameter STRM3_WIDTH = 32, - parameter STRM4_WIDTH = 32, - parameter STRM5_WIDTH = 32, - - //depths per stream - parameter STRM0_DEPTH = 2304, - parameter STRM1_DEPTH = 2304, - parameter STRM2_DEPTH = 2304, - parameter STRM3_DEPTH = 2304, - parameter STRM4_DEPTH = 2304, - parameter STRM5_DEPTH = 2304, - - //offsets for each stream - parameter STRM0_OFFSET = 0, - parameter STRM1_OFFSET = 2304, - parameter STRM2_OFFSET = 4608, - parameter STRM3_OFFSET = 6912, - parameter STRM4_OFFSET = 9216, - parameter STRM5_OFFSET = 11520, - - parameter AXILITE_ADDR_WIDTH = 2+$clog2(MEM_DEPTH*(1<<$clog2((MEM_WIDTH+31)/32))) -) - -( - input aclk, - input aresetn, - - output awready, - input awvalid, - input [AXILITE_ADDR_WIDTH-1:0] awaddr, - input [2:0] awprot, - //write data - output wready, - input wvalid, - input [31:0] wdata, - input [3:0] wstrb, - //burst response - input bready, - output bvalid, - output [1:0] bresp, - - //Read channels - //read address - output arready, - input arvalid, - input [AXILITE_ADDR_WIDTH-1:0] araddr, - input [2:0] arprot, - //read data - input rready, - output rvalid, - output [1:0] rresp, - output [31:0] rdata, - - //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits - input m_axis_0_afull, - input m_axis_0_tready, - output m_axis_0_tvalid, - output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata, - - input m_axis_1_afull, - input m_axis_1_tready, - output m_axis_1_tvalid, - output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata, - - input m_axis_2_afull, - input m_axis_2_tready, - output m_axis_2_tvalid, - output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata, - - input m_axis_3_afull, - input m_axis_3_tready, - output m_axis_3_tvalid, - output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata, - - input m_axis_4_afull, - input m_axis_4_tready, - output m_axis_4_tvalid, - output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata, - - input m_axis_5_afull, - input m_axis_5_tready, - output m_axis_5_tvalid, - output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata - - -); - -wire [31:0] config_address; -wire config_ce; -wire config_we; -wire config_rack; -wire [MEM_WIDTH-1:0] config_d0; -wire [MEM_WIDTH-1:0] config_q0; - -generate -if(NSTREAMS <= 2) begin: singleblock - - -memstream_singleblock -#( - .CONFIG_EN(CONFIG_EN), - .NSTREAMS(NSTREAMS), - .MEM_DEPTH(MEM_DEPTH), - .MEM_WIDTH(MEM_WIDTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE), - - //widths per stream - .STRM0_WIDTH(STRM0_WIDTH), - .STRM1_WIDTH(STRM1_WIDTH), - - //depths per stream - .STRM0_DEPTH(STRM0_DEPTH), - .STRM1_DEPTH(STRM1_DEPTH), - - //offsets for each stream - .STRM0_OFFSET(STRM0_OFFSET), - .STRM1_OFFSET(STRM1_OFFSET) -) -mem -( - .aclk(aclk), - .aresetn(aresetn), - - .config_address(config_address), - .config_ce(config_ce), - .config_we(config_we), - .config_d0(config_d0), - .config_q0(config_q0), - .config_rack(config_rack), - - .m_axis_0_tready(m_axis_0_tready), - .m_axis_0_tvalid(m_axis_0_tvalid), - .m_axis_0_tdata(m_axis_0_tdata), - - .m_axis_1_tready(m_axis_1_tready), - .m_axis_1_tvalid(m_axis_1_tvalid), - .m_axis_1_tdata(m_axis_1_tdata) -); - 
-assign m_axis_2_tvalid = 0; -assign m_axis_2_tdata = 0; -assign m_axis_3_tvalid = 0; -assign m_axis_3_tdata = 0; -assign m_axis_4_tvalid = 0; -assign m_axis_4_tdata = 0; -assign m_axis_5_tvalid = 0; -assign m_axis_5_tdata = 0; - -end else begin: multiblock - - -memstream_multiblock -#( - .CONFIG_EN(CONFIG_EN), - .NSTREAMS(NSTREAMS), - .MEM_DEPTH(MEM_DEPTH), - .MEM_WIDTH(MEM_WIDTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE), - - //widths per stream - .STRM0_WIDTH(STRM0_WIDTH), - .STRM1_WIDTH(STRM1_WIDTH), - .STRM2_WIDTH(STRM2_WIDTH), - .STRM3_WIDTH(STRM3_WIDTH), - .STRM4_WIDTH(STRM4_WIDTH), - .STRM5_WIDTH(STRM5_WIDTH), - - //depths per stream - .STRM0_DEPTH(STRM0_DEPTH), - .STRM1_DEPTH(STRM1_DEPTH), - .STRM2_DEPTH(STRM2_DEPTH), - .STRM3_DEPTH(STRM3_DEPTH), - .STRM4_DEPTH(STRM4_DEPTH), - .STRM5_DEPTH(STRM5_DEPTH), - - //offsets for each stream - .STRM0_OFFSET(STRM0_OFFSET), - .STRM1_OFFSET(STRM1_OFFSET), - .STRM2_OFFSET(STRM2_OFFSET), - .STRM3_OFFSET(STRM3_OFFSET), - .STRM4_OFFSET(STRM4_OFFSET), - .STRM5_OFFSET(STRM5_OFFSET) -) -mem -( - .aclk(aclk), - .aresetn(aresetn), - - .config_address(config_address), - .config_ce(config_ce), - .config_we(config_we), - .config_d0(config_d0), - .config_q0(config_q0), - - .m_axis_0_afull(m_axis_0_afull), - .m_axis_0_tready(m_axis_0_tready), - .m_axis_0_tvalid(m_axis_0_tvalid), - .m_axis_0_tdata(m_axis_0_tdata), - - .m_axis_1_afull(m_axis_1_afull), - .m_axis_1_tready(m_axis_1_tready), - .m_axis_1_tvalid(m_axis_1_tvalid), - .m_axis_1_tdata(m_axis_1_tdata), - - .m_axis_2_afull(m_axis_2_afull), - .m_axis_2_tready(m_axis_2_tready), - .m_axis_2_tvalid(m_axis_2_tvalid), - .m_axis_2_tdata(m_axis_2_tdata), - - .m_axis_3_afull(m_axis_3_afull), - .m_axis_3_tready(m_axis_3_tready), - .m_axis_3_tvalid(m_axis_3_tvalid), - .m_axis_3_tdata(m_axis_3_tdata), - - .m_axis_4_afull(m_axis_4_afull), - .m_axis_4_tready(m_axis_4_tready), - .m_axis_4_tvalid(m_axis_4_tvalid), - .m_axis_4_tdata(m_axis_4_tdata), - - .m_axis_5_afull(m_axis_5_afull), - .m_axis_5_tready(m_axis_5_tready), - .m_axis_5_tvalid(m_axis_5_tvalid), - .m_axis_5_tdata(m_axis_5_tdata) - -); - - -end -endgenerate - -axi4lite_if -#( - .ADDR_WIDTH(AXILITE_ADDR_WIDTH), - .DATA_WIDTH(32), - .IP_DATA_WIDTH(MEM_WIDTH) -) -config_if -( - //system signals - .aclk(aclk), - .aresetn(aresetn), - - //Write channels - //write address - .awready(awready), - .awvalid(awvalid), - .awaddr(awaddr), - .awprot(awprot), - //write data - .wready(wready), - .wvalid(wvalid), - .wdata(wdata), - .wstrb(wstrb), - //burst response - .bready(bready), - .bvalid(bvalid), - .bresp(bresp), - - //Read channels - //read address - .arready(arready), - .arvalid(arvalid), - .araddr(araddr), - .arprot(arprot), - //read data - .rready(rready), - .rvalid(rvalid), - .rresp(rresp), - .rdata(rdata), - - //IP-side interface - .ip_en(config_ce), - .ip_wen(config_we), - .ip_addr(config_address), - .ip_wdata(config_d0), - .ip_rack(config_rack), - .ip_rdata(config_q0) -); - -endmodule diff --git a/finn-rtllib/memstream/hdl/memstream_axi.sv b/finn-rtllib/memstream/hdl/memstream_axi.sv new file mode 100644 index 0000000000..136bcb1d7e --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream_axi.sv @@ -0,0 +1,136 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream_axi #( + int unsigned DEPTH, + int unsigned WIDTH, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto", + + localparam int unsigned AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2 +)( + // Global Control + input logic clk, + input logic rst, + + // AXI-lite Write + output logic awready, + input logic awvalid, + input logic [2:0] awprot, + input logic [AXILITE_ADDR_WIDTH-1:0] awaddr, + + output logic wready, + input logic wvalid, + input logic [31:0] wdata, + input logic [ 3:0] wstrb, + + input logic bready, + output logic bvalid, + output logic [1:0] bresp, + + // AXI-lite Read + output logic arready, + input logic arvalid, + input logic [2:0] arprot, + input logic [AXILITE_ADDR_WIDTH-1:0] araddr, + + input logic rready, + output logic rvalid, + output logic [ 1:0] rresp, + output logic [31:0] rdata, + + // Continuous output stream + input logic m_axis_0_tready, + output logic m_axis_0_tvalid, + output logic [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite to ap_memory Adapter + uwire [31:0] config_address; + uwire config_ce; + uwire config_we; + uwire config_rack; + uwire [WIDTH-1:0] config_d0; + uwire [WIDTH-1:0] config_q0; + axi4lite_if #( + .ADDR_WIDTH(AXILITE_ADDR_WIDTH), + .DATA_WIDTH(32), + .IP_DATA_WIDTH(WIDTH) + ) config_if ( + .aclk(clk), .aresetn(!rst), + + // Write Channels + .awready, .awvalid, .awaddr, .awprot, + .wready, .wvalid, .wdata, .wstrb, + .bready, .bvalid, .bresp, + + // Read Channels + .arready, .arvalid, .araddr, .arprot, + .rready, .rvalid, .rresp, .rdata, + + // IP-side Interface + .ip_en(config_ce), + .ip_wen(config_we), + .ip_addr(config_address), + .ip_wdata(config_d0), + .ip_rack(config_rack), + .ip_rdata(config_q0) + ); + + //----------------------------------------------------------------------- + // Streaming Memory Backend + memstream #( + .DEPTH(DEPTH), + .WIDTH(WIDTH), + .INIT_FILE(INIT_FILE), + 
.RAM_STYLE(RAM_STYLE) + ) mem ( + .clk, .rst, + + .config_address, + .config_ce, + .config_we, + .config_d0, + .config_q0, + .config_rack, + + .ordy(m_axis_0_tready), + .ovld(m_axis_0_tvalid), + .odat(m_axis_0_tdata[WIDTH-1:0]) + ); + if($bits(m_axis_0_tdata) > WIDTH) begin + assign m_axis_0_tdata[$left(m_axis_0_tdata):WIDTH] = '0; + end + +endmodule : memstream_axi diff --git a/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v new file mode 100644 index 0000000000..13f5c82d6e --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v @@ -0,0 +1,123 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream_axi_wrapper #( + parameter DEPTH = 512, + parameter WIDTH = 32, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto", + + parameter AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF m_axis_0, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // AXI-lite Write + output awready, + input awvalid, + input [2:0] awprot, + input [AXILITE_ADDR_WIDTH-1:0] awaddr, + + output wready, + input wvalid, + input [31:0] wdata, + input [ 3:0] wstrb, + + input bready, + output bvalid, + output [1:0] bresp, + + // AXI-lite Read + output arready, + input arvalid, + input [2:0] arprot, + input [AXILITE_ADDR_WIDTH-1:0] araddr, + + input rready, + output rvalid, + output [ 1:0] rresp, + output [31:0] rdata, + + // Continuous output stream + input m_axis_0_tready, + output m_axis_0_tvalid, + output [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata +); + + localparam INIT_FILTERED = +`ifdef SYNTHESIS + RAM_STYLE == "ultra"? 
"" : +`endif + INIT_FILE; + + memstream_axi #( + .DEPTH(DEPTH), .WIDTH(WIDTH), + .INIT_FILE(INIT_FILTERED), + .RAM_STYLE(RAM_STYLE) + ) core ( + .clk(ap_clk), .rst(!ap_rst_n), + + // AXI-lite Write + .awready(awready), + .awvalid(awvalid), + .awprot(awprot), + .awaddr(awaddr), + .wready(wready), + .wvalid(wvalid), + .wdata(wdata), + .wstrb(wstrb), + .bready(bready), + .bvalid(bvalid), + .bresp(bresp), + + // AXI-lite Read + .arready(arready), + .arvalid(arvalid), + .arprot(arprot), + .araddr(araddr), + .rready(rready), + .rvalid(rvalid), + .rresp(rresp), + .rdata(rdata), + + // Continuous output stream + .m_axis_0_tready(m_axis_0_tready), + .m_axis_0_tvalid(m_axis_0_tvalid), + .m_axis_0_tdata(m_axis_0_tdata) + ); + +endmodule : memstream_axi_wrapper diff --git a/finn-rtllib/memstream/hdl/memstream_multiblock.v b/finn-rtllib/memstream/hdl/memstream_multiblock.v deleted file mode 100644 index 4e6167132d..0000000000 --- a/finn-rtllib/memstream/hdl/memstream_multiblock.v +++ /dev/null @@ -1,474 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module memstream_multiblock -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths - parameter CONFIG_EN = 1, - parameter NSTREAMS = 6,//1 up to 6 - - parameter MEM_DEPTH = 13824, - parameter MEM_WIDTH = 32, - parameter MEM_INIT = "./", - parameter RAM_STYLE = "auto", - - //widths per stream - parameter STRM0_WIDTH = 32, - parameter STRM1_WIDTH = 32, - parameter STRM2_WIDTH = 32, - parameter STRM3_WIDTH = 32, - parameter STRM4_WIDTH = 32, - parameter STRM5_WIDTH = 32, - - //depths per stream - parameter STRM0_DEPTH = 2304, - parameter STRM1_DEPTH = 2304, - parameter STRM2_DEPTH = 2304, - parameter STRM3_DEPTH = 2304, - parameter STRM4_DEPTH = 2304, - parameter STRM5_DEPTH = 2304, - - //offsets for each stream - parameter STRM0_OFFSET = 0, - parameter STRM1_OFFSET = 2304, - parameter STRM2_OFFSET = 4608, - parameter STRM3_OFFSET = 6912, - parameter STRM4_OFFSET = 9216, - parameter STRM5_OFFSET = 11520 -) - -( - input aclk, - input aresetn, - - //optional configuration interface compatible with ap_memory - input [31:0] config_address, - input config_ce, - input config_we, - input [31:0] config_d0, - output [31:0] config_q0, - output config_rack, - - //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits - input m_axis_0_afull, - input m_axis_0_tready, - output m_axis_0_tvalid, - output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata, - - input m_axis_1_afull, - input m_axis_1_tready, - output m_axis_1_tvalid, - output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata, - - input m_axis_2_afull, - input m_axis_2_tready, - output m_axis_2_tvalid, - output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata, - - input m_axis_3_afull, - input m_axis_3_tready, - output m_axis_3_tvalid, - output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata, - - input m_axis_4_afull, - input m_axis_4_tready, - output m_axis_4_tvalid, - output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata, - - input m_axis_5_afull, - input m_axis_5_tready, - output m_axis_5_tvalid, - output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata - - -); - -//calculate number of RAMB18 blocks we need depth-wise -localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024) - -//calculate width of address for each block -localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 
10 : $clog2(MEM_DEPTH); - -//determine whether a stream needs to multiplex between memory blocks -localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024)); -localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024)); -localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024)); -localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024)); -localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024)); -localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024)); - -//determine what the base block of each stream is -localparam STRM0_BLOCK = (STRM0_OFFSET/1024); -localparam STRM1_BLOCK = (STRM1_OFFSET/1024); -localparam STRM2_BLOCK = (STRM2_OFFSET/1024); -localparam STRM3_BLOCK = (STRM3_OFFSET/1024); -localparam STRM4_BLOCK = (STRM4_OFFSET/1024); -localparam STRM5_BLOCK = (STRM5_OFFSET/1024); - -//determine what the end block of each stream is -localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024); -localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024); -localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024); -localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024); -localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024); -localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024); - -//determine the number of blocks spanned by each stream -localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1; -localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1; -localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1; -localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1; -localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1; -localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1; - -//TODO: check that memory width is equal to the widest stream -//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?) -initial begin - if((NSTREAMS < 1) | (NSTREAMS > 6)) begin - $display("Invalid setting for NSTREAMS, please set in range [1,6]"); - $finish(); - end -end - -//invert reset -wire rst; -assign rst = ~aresetn; - -//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed -//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth - -reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET; - -reg strm0_incr_en; -reg strm1_incr_en; -reg strm2_incr_en; -reg strm3_incr_en; -reg strm4_incr_en; -reg strm5_incr_en; - -wire strm0_rst; -wire strm1_rst; -wire strm2_rst; -wire strm3_rst; -wire strm4_rst; -wire strm5_rst; - -reg strm0_ready; -reg strm1_ready; -reg strm2_ready; -reg strm3_ready; -reg strm4_ready; -reg strm5_ready; - -//arbiter: work on one stream at a time -//multiplex each port between (up to) half of the streams -reg [1:0] current_stream_porta = 0; -reg [1:0] current_stream_portb = 0; - -always @(posedge aclk) begin - if(rst) - current_stream_porta <= 0; - else case(current_stream_porta) - 0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 
2 : 0; - 1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1; - 2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2; - endcase - if(rst) - current_stream_portb <= 0; - else case(current_stream_portb) - 0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0; - 1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1; - 2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2; - endcase -end - -always @(posedge aclk) begin - if(rst) begin - strm0_incr_en <= 0; - strm1_incr_en <= 0; - strm2_incr_en <= 0; - strm3_incr_en <= 0; - strm4_incr_en <= 0; - strm5_incr_en <= 0; - end else begin - strm0_incr_en <= (current_stream_porta == 0) & strm0_ready; - strm1_incr_en <= (current_stream_portb == 0) & strm1_ready; - strm2_incr_en <= (current_stream_porta == 1) & strm2_ready; - strm3_incr_en <= (current_stream_portb == 1) & strm3_ready; - strm4_incr_en <= (current_stream_porta == 2) & strm4_ready; - strm5_incr_en <= (current_stream_portb == 2) & strm5_ready; - end -end - -assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1)); -assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1)); -assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1)); -assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1)); -assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1)); -assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1)); - -always @(posedge aclk) begin - strm0_ready <= ~m_axis_0_afull; - strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2); - strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3); - strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4); - strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5); - strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6); -end - -//one address counter per stream; more LUTs but keeps routing short and local -always @(posedge aclk) begin - if(strm0_rst | rst) - strm0_addr <= STRM0_OFFSET; - else if(strm0_incr_en) - strm0_addr <= strm0_addr + 1; - if(strm1_rst | rst) - strm1_addr <= STRM1_OFFSET; - else if(strm1_incr_en) - strm1_addr <= strm1_addr + 1; - if(strm2_rst | rst) - strm2_addr <= STRM2_OFFSET; - else if(strm2_incr_en) - strm2_addr <= strm2_addr + 1; - if(strm3_rst | rst) - strm3_addr <= STRM3_OFFSET; - else if(strm3_incr_en) - strm3_addr <= strm3_addr + 1; - if(strm4_rst | rst) - strm4_addr <= STRM4_OFFSET; - else if(strm4_incr_en) - strm4_addr <= strm4_addr + 1; - if(strm5_rst | rst) - strm5_addr <= STRM5_OFFSET; - else if(strm5_incr_en) - strm5_addr <= strm5_addr + 1; -end - -reg [$clog2(MEM_DEPTH)-1:0] addra; -wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa; - -reg [$clog2(MEM_DEPTH)-1:0] addrb; -wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb; - -wire [NMEMBLOCKS-1:0] we; - -reg [1:0] addr_select_porta; -reg [1:0] addr_select_portb; - -//multiplex addresses of various streams into address ports of memory -always @(posedge aclk) begin - addr_select_porta <= current_stream_porta; - case(addr_select_porta) - 0: addra <= strm0_addr; - 1: addra <= strm2_addr; - 2: addra <= strm4_addr; - endcase - addr_select_portb <= current_stream_portb; - case(addr_select_portb) - 0: addrb <= strm1_addr; - 1: addrb <= strm3_addr; - 2: addrb <= strm5_addr; - endcase -end - -genvar g; -generate for(g=0; g 1) begin: multiblock - -wire [MEM_WIDTH-1:0] rdqmux[5:0]; - -reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0]; -reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0]; - -always @(posedge aclk) begin - 
rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH]; - rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH]; - for(i=0; i<2; i=i+1) begin - rdblocka[i+1] <= rdblocka[i]; - rdblockb[i+1] <= rdblockb[i]; - end -end - -if(NSTREAMS >= 1) begin: en_strm0 - if(STRM0_MUX == 1) begin: mux0 - mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK); - end else begin: nomux0 - assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH]; - end - assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0]; -end - -if(NSTREAMS >= 2) begin: en_strm1 - if(STRM1_MUX == 1) begin: mux1 - mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK); - end else begin: nomux1 - assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH]; - end - assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0]; -end - -if(NSTREAMS >= 3) begin: en_strm2 - if(STRM2_MUX == 1) begin: mux2 - mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK); - end else begin: nomux2 - assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH]; - end - assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0]; -end - -if(NSTREAMS >= 4) begin: en_strm3 - if(STRM3_MUX == 1) begin: mux3 - mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK); - end else begin: nomux3 - assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH]; - end - assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0]; -end - -if(NSTREAMS >= 5) begin: en_strm4 - if(STRM4_MUX == 1) begin: mux4 - mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK); - end else begin: nomux4 - assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH]; - end - assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0]; -end - -if(NSTREAMS >= 6) begin: en_strm5 - if(STRM5_MUX == 1) begin: mux5 - mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK); - end else begin: nomux5 - assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH]; - end - assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0]; -end - -end else begin: singleblock - -if(NSTREAMS >= 1) begin: en_strm0_direct - assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0]; -end -if(NSTREAMS >= 2) begin: en_strm1_direct - assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0]; -end -if(NSTREAMS >= 3) begin: en_strm2_direct - assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0]; -end -if(NSTREAMS >= 4) begin: en_strm3_direct - assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0]; -end -if(NSTREAMS >= 5) begin: en_strm4_direct - assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0]; -end -if(NSTREAMS >= 6) begin: en_strm5_direct - assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0]; -end - -end -endgenerate - -//output to AXI Streams -reg tvalid_pipe0[2:0]; -reg tvalid_pipe1[2:0]; -reg tvalid_pipe2[2:0]; -reg tvalid_pipe3[2:0]; -reg tvalid_pipe4[2:0]; -reg tvalid_pipe5[2:0]; - -assign m_axis_0_tvalid = tvalid_pipe0[2]; -assign m_axis_1_tvalid = tvalid_pipe1[2]; -assign m_axis_2_tvalid = tvalid_pipe2[2]; -assign m_axis_3_tvalid = tvalid_pipe3[2]; -assign m_axis_4_tvalid = tvalid_pipe4[2]; -assign m_axis_5_tvalid = 
tvalid_pipe5[2]; - - -always @(posedge aclk) begin - tvalid_pipe0[0] <= strm0_incr_en; - tvalid_pipe1[0] <= strm1_incr_en; - tvalid_pipe2[0] <= strm2_incr_en; - tvalid_pipe3[0] <= strm3_incr_en; - tvalid_pipe4[0] <= strm4_incr_en; - tvalid_pipe5[0] <= strm5_incr_en; - for(i=0; i<2; i=i+1) begin: srl - tvalid_pipe0[i+1] <= tvalid_pipe0[i]; - tvalid_pipe1[i+1] <= tvalid_pipe1[i]; - tvalid_pipe2[i+1] <= tvalid_pipe2[i]; - tvalid_pipe3[i+1] <= tvalid_pipe3[i]; - tvalid_pipe4[i+1] <= tvalid_pipe4[i]; - tvalid_pipe5[i+1] <= tvalid_pipe5[i]; - end -end - -//dummy read, for now -assign config_q0 = 0; -assign config_rack = config_ce & ~config_we; - -endmodule diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v deleted file mode 100644 index c9b8770aaa..0000000000 --- a/finn-rtllib/memstream/hdl/memstream_singleblock.v +++ /dev/null @@ -1,246 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-
-/*
- Implements a lightweight streamer for up to 2 streams in a single block of memory.
-*/
-
-module memstream_singleblock
-#(
-    parameter CONFIG_EN = 1,
-    parameter NSTREAMS = 2, //1 up to 2
-
-    parameter MEM_DEPTH = 512,
-    parameter MEM_WIDTH = 32,
-    parameter MEM_INIT = "./",
-    parameter RAM_STYLE = "auto",
-
-    //widths per stream
-    parameter STRM0_WIDTH = 32,
-    parameter STRM1_WIDTH = 32,
-
-    //depths per stream
-    parameter STRM0_DEPTH = 256,
-    parameter STRM1_DEPTH = 256,
-
-    //offsets for each stream
-    parameter STRM0_OFFSET = 0,
-    parameter STRM1_OFFSET = 256
-)
-(
-    input aclk,
-    input aresetn,
-
-    //optional configuration interface compatible with ap_memory
-    input [31:0] config_address,
-    input config_ce,
-    input config_we,
-    input [MEM_WIDTH-1:0] config_d0,
-    output [MEM_WIDTH-1:0] config_q0,
-    output config_rack,
-
-    //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
-    input m_axis_0_tready,
-    output m_axis_0_tvalid,
-    output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-
-    input m_axis_1_tready,
-    output m_axis_1_tvalid,
-    output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata
-);
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
-initial begin
-    if((NSTREAMS < 1) | (NSTREAMS > 2)) begin
-        $display("Invalid setting for NSTREAMS, please set in range [1,2]");
-        $finish();
-    end
-end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-wire strm0_incr_en;
-wire strm1_incr_en;
-
-assign strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid;
-assign strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid;
-
-reg rack_shift[1:0];
-
-generate
-if(MEM_DEPTH > 1) begin: use_ram
-
-//calculate width of memory address, with a minimum of 1 bit
-localparam BLOCKADRWIDTH = $clog2(MEM_DEPTH);
-
-reg [BLOCKADRWIDTH-1:0] strm0_addr = STRM0_OFFSET;
-wire strm0_rst;
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
-    if(strm0_rst | rst)
-        strm0_addr <= STRM0_OFFSET;
-    else if(strm0_incr_en)
-        strm0_addr <= strm0_addr + 1;
-end
-
-if(NSTREAMS == 1) begin: sdp
-
-ramb18_sdp
-#(
-    .ID(0),
-    .DWIDTH(MEM_WIDTH),
-    .AWIDTH(BLOCKADRWIDTH),
-    .DEPTH(MEM_DEPTH),
-    .MEM_INIT(MEM_INIT),
-    .RAM_STYLE(RAM_STYLE)
-)
-ram
-(
-    .clk(aclk),
-
-    .ena(config_ce),
-    .wea(config_we),
-    .addra(config_address[BLOCKADRWIDTH-1:0]),
-    .wdataa(config_d0),
-
-    .enb(strm0_incr_en | config_ce),
-    .enqb(strm0_incr_en | rack_shift[0]),
-    .addrb(config_ce ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
-    .rdqb(m_axis_0_tdata)
-);
-
-end else begin: tdp
-
-reg [BLOCKADRWIDTH-1:0] strm1_addr = STRM1_OFFSET;
-wire strm1_rst;
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-
-always @(posedge aclk) begin
-    if(strm1_rst | rst)
-        strm1_addr <= STRM1_OFFSET;
-    else if(strm1_incr_en)
-        strm1_addr <= strm1_addr + 1;
-end
-
-ramb18_wf_dualport
-#(
-    .ID(0),
-    .DWIDTH(MEM_WIDTH),
-    .AWIDTH(BLOCKADRWIDTH),
-    .DEPTH(MEM_DEPTH),
-    .MEM_INIT(MEM_INIT),
-    .RAM_STYLE(RAM_STYLE)
-)
-ram
-(
-    .clk(aclk),
-
-    .wea(config_we),
-    .ena(strm0_incr_en | config_ce),
-    .enqa(strm0_incr_en | rack_shift[0]),
-    .addra(config_we ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
-    .wdataa(config_d0),
-    .rdqa(m_axis_0_tdata),
-
-    .web(1'b0),
-    .enb(strm1_incr_en),
-    .enqb(strm1_incr_en),
-    .addrb(strm1_addr),
-    .wdatab('d0),
-    .rdqb(m_axis_1_tdata)
-);
-
-end
-
-end else begin: bypass
-
-reg [MEM_WIDTH-1:0] singleval[0:0];
-initial begin
-    `ifdef SYNTHESIS
-    $readmemh({MEM_INIT,"memblock_synth_0.dat"}, singleval, 0, 0);
-    `else
-    $readmemh({MEM_INIT,"memblock_sim_0.dat"}, singleval, 0, 0);
-    `endif
-end
-
-always @(posedge aclk)
-    if(config_ce & config_we)
-        singleval[0] <= config_d0;
-
-assign m_axis_0_tdata = singleval[0];
-assign m_axis_1_tdata = singleval[0];
-
-end
-endgenerate
-
-//signal valid 2 tready cycles after initialization, then stay valid
-reg [1:0] tvalid_pipe0 = 2'd0;
-reg [1:0] tvalid_pipe1 = 2'd0;
-
-assign m_axis_0_tvalid = tvalid_pipe0[1];
-assign m_axis_1_tvalid = tvalid_pipe1[1];
-
-always @(posedge aclk) begin
-    if(rst) begin
-        tvalid_pipe0 <= 0;
-    end else if(strm0_incr_en) begin
-        tvalid_pipe0[0] <= 1;
-        tvalid_pipe0[1] <= tvalid_pipe0[0];
-    end
-end
-
-always @(posedge aclk) begin
-    if(rst) begin
-        tvalid_pipe1 <= 0;
-    end else if(strm1_incr_en) begin
-        tvalid_pipe1[0] <= 1;
-        tvalid_pipe1[1] <= tvalid_pipe1[0];
-    end
-end
-
-always @(posedge aclk) begin
-    rack_shift[0] <= config_ce & ~config_we;
-    rack_shift[1] <= rack_shift[0];
-end
-
-assign config_rack = rack_shift[1];
-assign config_q0 = m_axis_0_tdata;
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/mux.v b/finn-rtllib/memstream/hdl/mux.v
deleted file mode 100644
index f7087f9735..0000000000
--- a/finn-rtllib/memstream/hdl/mux.v
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- [standard FINN BSD-3-Clause redistribution terms and disclaimer omitted for brevity]
-*/
-
-module mux
-#(
-    parameter NINPUTS = 1,
-    parameter WIDTH = 16
-)
-(
-    input [NINPUTS*WIDTH-1:0] in,
-    output [WIDTH-1:0] out,
-    input [$clog2(NINPUTS)-1:0] sel
-);
-
-assign out = in >> (sel*WIDTH);
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_sdp.v b/finn-rtllib/memstream/hdl/ramb18_sdp.v
deleted file mode 100644
index 8d2fbf9a98..0000000000
--- a/finn-rtllib/memstream/hdl/ramb18_sdp.v
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- [standard FINN BSD-3-Clause redistribution terms and disclaimer omitted for brevity]
-*/
-
-module ramb18_sdp
-#(
-    parameter ID = 0,
-    parameter DWIDTH = 18,
-    parameter AWIDTH = 10,
-    parameter DEPTH = 2**AWIDTH,
-    parameter MEM_INIT = "",
-    parameter RAM_STYLE = "auto"
-)
-(
-    input clk,
-
-    input ena,
-    input wea,
-    input [AWIDTH-1:0] addra,
-    input [DWIDTH-1:0] wdataa,
-
-    input enb,
-    input enqb,
-    input [AWIDTH-1:0] addrb,
-    output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
-reg [DWIDTH-1:0] rdatab;
-
-`ifdef SYNTHESIS
-reg [7:0] idx = ID;
-`else
-reg [15:0] idx;
-`endif
-
-//initialize memory
-initial begin
-    //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
-    //ID can go up to 99
-    if (ID < 0 || ID > 99) begin
-        $display("ID out of range [0-99]");
-        $finish();
-    end
-    //MEM_INIT path must be terminated by /
-    `ifdef SYNTHESIS
-    if (ID < 10)
-        $readmemh({MEM_INIT,"memblock_synth_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
-    else
-        $readmemh({MEM_INIT,"memblock_synth_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
-    `else
-    $sformat(idx,"%0d",ID);
-    if (ID < 10)
-        $readmemh({MEM_INIT,"memblock_sim_",idx[7:0],".dat"}, mem, 0, DEPTH-1);
-    else
-        $readmemh({MEM_INIT,"memblock_sim_",idx,".dat"}, mem, 0, DEPTH-1);
-    `endif
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
-    if(wea)
-        mem[addra] <= wdataa;
-    if(enb)
-        rdatab <= mem[addrb];
-    if(enqb)
-        rdqb <= rdatab;
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
deleted file mode 100644
index c7850106ae..0000000000
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- [standard FINN BSD-3-Clause redistribution terms and disclaimer omitted for brevity]
-*/
-
-module ramb18_wf_dualport
-#(
-    parameter ID = 0,
-    parameter DWIDTH = 18,
-    parameter AWIDTH = 10,
-    parameter DEPTH = 2**AWIDTH,
-    parameter MEM_INIT = "",
-    parameter RAM_STYLE = "auto"
-)
-(
-    input clk,
-
-    input wea,
-    input ena,
-    input enqa,
-    input [AWIDTH-1:0] addra,
-    input [DWIDTH-1:0] wdataa,
-    output reg [DWIDTH-1:0] rdqa,
-
-    input web,
-    input enb,
-    input enqb,
-    input [AWIDTH-1:0] addrb,
-    input [DWIDTH-1:0] wdatab,
-    output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
-reg [DWIDTH-1:0] rdataa;
-reg [DWIDTH-1:0] rdatab;
-
-`ifdef SYNTHESIS
-reg [7:0] idx = ID;
-`else
-reg [15:0] idx;
-`endif
-
-//initialize memory
-initial begin
-    //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
-    //ID can go up to 99
-    if (ID < 0 || ID > 99) begin
-        $display("ID out of range [0-99]");
-        $finish();
-    end
-    //MEM_INIT path must be terminated by /
-    `ifdef SYNTHESIS
-    if (ID < 10)
-        $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
-    else
-        $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
-    `else
-    $sformat(idx,"%0d",ID);
-    if (ID < 10)
-        $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1);
-    else
-        $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1);
-    `endif
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
-    if(ena) begin
-        if(wea)
-            mem[addra] <= wdataa;
-        rdataa <= mem[addra];
-    end
-    if(enqa)
-        rdqa <= rdataa;
-end
-always @(posedge clk) begin
-    if(enb) begin
-        if(web)
-            mem[addrb] <= wdatab;
-        rdatab <= mem[addrb];
-    end
-    if(enqb)
-        rdqb <= rdatab;
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/sim/gen_memblocks.sh b/finn-rtllib/memstream/sim/gen_memblocks.sh
deleted file mode 100644
index b6e6b656ad..0000000000
--- a/finn-rtllib/memstream/sim/gen_memblocks.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# [standard FINN BSD-3-Clause redistribution terms and disclaimer omitted for brevity]
-
-NLINES=`cat $1 | wc -l`
-NBLOCKS=$(( ($NLINES + 1023) / 1024 ))
-rm memblock_*.dat
-
-for (( i=0; i<$NBLOCKS; i++ ))
-do
-    START=$(( 1 + $i * 1024 ))
-    tail -n +$START $1 | head -n 1024 >> memblock_$i.dat
-done
diff --git a/finn-rtllib/memstream/sim/golden.dat b/finn-rtllib/memstream/sim/golden.dat
deleted file mode 100644
index 1466271bca..0000000000
--- a/finn-rtllib/memstream/sim/golden.dat
+++ /dev/null
@@ -1,9216 +0,0 @@
-[9216 lines of 32-bit hex golden reference data omitted]
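
Aside: the deleted gen_memblocks.sh above splits one hex init file into 1024-line memblock_<i>.dat slices, one per RAMB18 primitive. A rough Python equivalent of that splitting, for reference only; the 1024-line block size and the memblock_<i>.dat names come from the script, while the function name and argument handling are illustrative:

import sys

def gen_memblocks(src_path, lines_per_block=1024):
    # read all hex words (one per line), as the script's tail/head pipeline does
    with open(src_path) as f:
        lines = f.read().splitlines()
    # ceiling division, mirroring (NLINES + 1023) / 1024
    nblocks = (len(lines) + lines_per_block - 1) // lines_per_block
    for i in range(nblocks):
        chunk = lines[i * lines_per_block:(i + 1) * lines_per_block]
        with open("memblock_%d.dat" % i, "w") as out:
            out.write("\n".join(chunk) + "\n")

if __name__ == "__main__":
    gen_memblocks(sys.argv[1])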
-482D0881 -65EAB7D8 -620494B9 -6160FAE2 -542E102 -81BCAF6F -C31AABA5 -BEFFEDB4 -A802765 -68A8ED5B -A47FADCE -3EC1897A -4DBCCC04 -83EAFD50 -6B8E05E7 -4FA1891A -9C2FCD23 -9ED7C877 -15FF9D1F -67DE6F18 -D2932D4B -E4B31601 -60B47713 -C1326724 -1F5FD6C9 -2A54C06B -599854F5 -C2121D8C -2D0FAD3B -762DB289 -CCE2E11E -622AD608 -29836424 -C9F1F838 -4E0F9445 -16C53328 -B9F2FC2E -28FFB831 -7C216796 -E065DC2C -561328B -92EEB73E -BBC5AE83 -2DE49E4B -BB32B7FC -E59D7B63 -B3375867 -5523615E -5532A7B5 -6890882D -21F33D70 -EA855CD7 -CBB7B3A1 -DD9C122E -5CEAC143 -E9E4332A -6F658BF6 -57E90D54 -715AA7A1 -DE7768FF -D8A3302B -1BECD73C -AD442F70 -EBBCB63 -5D25E0FB -EF9854C7 -DEBB6E96 -61591E99 -BE06EE6B -F74EDD0E -124B1712 -45833671 -1227307A -546B647C -9D2398D1 -DDB609E -EB68EAF7 -F05AFA0B -A6EABBB9 -60B5FC76 -992D25CF -A99743C -5FF72996 -E3D84005 -F47AC3D6 -D92BCBEB -3AD6BC2D -399AE49E -FFD7134A -80856732 -8C92A116 -D23F2A7F -1C1FF7CD -7E97215D -63CE5EAB -1E3D6441 -8CC7E1E2 -3144CABE -1B369565 -E681B9FD -3F72A224 -3146105D -68639F13 -61E4A798 -CF28AF43 -F18B6903 -F4D16333 -557BEB41 -F5DEEE8E -41F036AB -D0DBBD23 -E8E240CB -8FE50644 -8EF8CB38 -F8D6EBA6 -580EDAAC -25F0FEBF -1E09176D -CD156787 -8198153A -3D5D3DE3 -5132C51F -4B39B7FD -15BAA338 -AC2E0CAE -91DC2332 -3632CBA5 -2AD744AC -EF31B613 -6A9D8019 -17DE8C90 -E5CC66F7 -E81411C2 -C5B6931B -E8CF72F1 -ABF2E66 -5B7DEA27 -340E7880 -2B4ED84D -F6E86748 -9C181F92 -55DCA269 -1CEE9C9D -1DB0A271 -B1BB73B1 -2B802754 -596ED430 -25F4A422 -E186EA6C -A0793E1F -B54A8F34 -4EEA557C -A8085CD6 -276D7E7A -F711A6D4 -2534D88B -FA8CEFBD -A7E9E1C7 -EF6F2E -4620FD63 -7955C107 -50E0A968 -81DBA8B6 -92E0F3D4 -C78C01F7 -CFE5AB0F -C290FC3B -F12CC1D9 -56A9B1DA -69AC05FF -964D8EE -EB198C02 -A3D9435 -30D0BD52 -2A1A5868 -DF336813 -14C97AB3 -BA6717D1 -43FC05DC -32A6FFBC -C47276AB -DECB3B2F -1511FAA2 -155693C7 -E5BB37E4 -CB20ED97 -FDFD4014 -FFB25A3D -4F8B2CCE -8EC8D538 -A60DDEE4 -9E6196D0 -8895A4D -A2528B98 -D02F59B9 -47662556 -4FAB84CE -6C7FC2FC -F351CBF4 -F1917707 -B1F2737C -B46CC768 -F87757B9 -A24CA3F5 -74EC8337 -C46290C3 -77BBC380 -1B3087DC -C816F73C -6E2C562B -27C3E900 -4FB423EC -A77B1E37 -51063C80 -432108D2 -11F0367D -1D08F91D -D56068FA -F259DE46 -26CF3619 -6E6AF5EC -10AFB2EE -14F925E9 -5382204 -9F482CE6 -90B0897C -C768AA0B -654ED88C -AD60966B -8EB54FB3 -26275630 -A1C50A7E -21587F6E -9496FD06 -4B768A3F -1798404A -28C6B4D8 -5B579E3D -C79ECD09 -EC63FA6A -162A0135 -7FB7DDB1 -A0167E99 -196F14DB -CCD227F3 -3FB917CC -A3D30D38 -71874379 -E9E489BD -5DA989C2 -4F7C8E1 -F6E0502F -F8445D16 -25CC5FFA -FB06FF63 -CFEA3C99 -E41A8123 -6A5A256C -D7B67156 -50BDCCD2 -8165541 -F067F327 -B1E17258 -6901F3B0 -8B8CA0AC -CBA88A2D -4736E05D -DD5AD020 -35B501DF -73C67F6F -F2C513F -E6CF7C2D -E6A85B1B -8AE4F7E6 -1ACA7CFC -BCFCC182 -2930369B -642DC973 -990B6772 -681EC185 -164AC235 -9C676AC8 -B200AD7D -F13B8C8D -9D22DB12 -CE95663D -CE956E42 -29485F4F -BC5D5F8E -DAB561EF -C4C15BAA -77B9192C -86E8BF86 -5933ECE -E50B93C6 -F8B0CFB0 -3286711B -DD558ED9 -DD043899 -4AFAB231 -637BB2D7 -87036D19 -9A30430F -27798B63 -4D6E407D -CEE251F5 -ADFFB995 -B5C885B2 -7DF6519C -6EF51C85 -B95DAF30 -65EA99E7 -772FBB19 -49DBE1EC -F386A79B -EECD2F55 -8935CCEC -BAC4C120 -C71F82EF -2DF7E67D -9BA39901 -9614A4E1 -C6304402 -236FC777 -D47A5719 -8098EC85 -799E34F4 -896EBD9 -BAB10372 -32ED359C -6F9F763B -9D517447 -22B55AB9 -8E6F4104 -15BEC5D3 -6252E010 -23B5E8E7 -D0B113BA -965C42E7 -F2A0C19A -24CB582E -1F449982 -2E805DF0 -851608AC -755273C7 -3529A161 -6395258D -C5BD7D0C -27BABE75 -E1628E4A -47E5CD77 -EE797B13 -AB11893E -2F65151B -9CE2B20B -233C28A5 -749A0C91 
-846BC1E1 -8C36F8FE -1489CF6A -70FB6BE0 -D0A84133 -9734B9B7 -FF166A04 -D118033F -BDDB2D63 -6F6691F0 -44FB36D0 -EFF2B14E -AC02C863 -ADFD2972 -905F6E84 -7C0008A8 -4A043A53 -D104FDC0 -1687FF25 -E6CF8FCF -120143AE -53F92C72 -19E2E798 -EE8C6B94 -15CEA57D -C8968EBD -D50EFBA3 -A8EA5FE1 -E2D073FB -B4EE195F -8928A91F -6B9EB970 -C24B509C -5D340563 -85FC3F3B -934FA012 -A2AB8533 -A6BD3187 -105DF0E3 -243ADD05 -49C299EF -7A42F84C -C90A1935 -3268B298 -CFA3B2EE -470C6457 -E579D2C4 -BB10428B -78D10FE4 -11F21813 -8424CE28 -EA2B114 -8239463D -9804414B -44B4FD1D -82D50F88 -10AED1B6 -E4768ADE -E7235A66 -C8705714 -936532B0 -15C63108 -92A91B17 -154B2415 -9BF0D15C -5F451388 -1DC102A8 -96CAFC23 -B076C0DE -3EBDCC3D -6B2EE523 -C6777AA9 -F7F48C4A -B1E8ADBD -FA30AC90 -5173D22A -D22827A6 -6504AED6 -3115E6F6 -E8937768 -C5ACC0E9 -366E15FD -AB81C84C -C27AFE96 -7361C8B1 -613A0811 -595F48E4 -1619DFA6 -233D2474 -4C174E1C -E7DCC63F -308FDED9 -502A0AB0 -C5004E90 -B7FBEFEB -918A77FF -F7235A04 -5CCB8B7E -3BA4B1ED -32F47DAC -FF7348B1 -996C8E7 -7203F1B0 -70583A2C -4D8046A0 -551119AD -BE5B31AE -35400CC7 -E8ECD409 -D1C104E0 -1A0858F -F26946 -458C8B3F -E8D66E91 -2F3F6384 -B36EC71B -289CD4C6 -6CA9E35 -B198A8B -816873F1 -346D66C9 -BD906E97 -802E5969 -261BBBD1 -9D7605C6 -72C2CDE6 -6C8DBDB5 -D7C8DD7C -F43FB2C8 -A9F384E6 -78FDC918 -6D20841A -20755F34 -F4C6AF99 -19393B53 -A525AE84 -CE881A38 -3D075300 -9B0E4DCA -7EB7E7A1 -4C4FD44A -78483ED6 -32D9D894 -1CCD379A -EA5FEB4B -F7E001D -44FA69A5 -E99F66B6 -9E16CD0B -CD098C41 -6DAAD279 -5FE50411 -CC855E2 -130C6563 -356CD9A1 -BFB318B8 -2E963C0F -DC5A046A -FE16FB -A599857C -F72FE561 -2914E4FE -B247AE8D -6A6F13C0 -B1052C98 -8086E53A -845345BA -D43D5F7A -82B30F5E -4206EB1B -89CCA1AE -86289F6 -567F22DE -25624C58 -6A78EC3F -7EC32D03 -8017213D -3A141336 -D1CA4E6E -FA84C2C -FE670E0 -3238E01 -18DF1794 -A7B900AD -1FCE47CD -14EFDCB1 -C21B04A8 -4C3343A2 -E5E611B7 -ADD06EF0 -32C81695 -201A9FEE -BA8925BB -5182EEED -7DA4917E -CC331235 -C304ABE9 -C2A16075 -937E1C4C -CCA0184E -9DB6C45A -3F2A79C9 -151B469E -162F22DA -D955D54E -E857CC0E -FFF2005B -60AD87FD -85512214 -E0A506A0 -FAF1A145 -9DA17F03 -332D26D1 -9EDF9643 -7BBF2D9D -3414FEA0 -A8FE5964 -D4841879 -3AE4E5EA -BC6B6D60 -950F4693 -70FD0254 -177C7A1F -635FE5B9 -C0C5B6CD -15D1D22F -BA495903 -CC100F38 -A5F1E225 -5AB4584F -AC4731FD -ABB04167 -A0E153B4 -5982BDA9 -8E2EE3AF -D635C631 -7C6154A2 -9F0EEFEE -429B22CA -B1346D4E -6B21663D -6A7EDD8A -DA34A355 -217132F0 -683BA78 -9CD46320 -A5D3BC4F -3194AB03 -DD66F958 -E7506C47 -17EE83A2 -4E4D80A0 -EB56662F -BE889C58 -6F5F6745 -2A05C12F -13D266A0 -3B2B18C9 -EF435E02 -5604DB7F -D35888A2 -CCC34421 -55E24355 -7F607F34 -E493720B -C6A492D7 -7DC6A789 -E01474B2 -97D35C32 -71F32335 -D3083D7 -2327D424 -35EA4BA1 -F5B20C6F -3ED28FCC -453A76AE -192A79A6 -2E64285D -A9463AEB -374E22E0 -92A5CF8F -E707F8E8 -B8E2FF36 -E8E959EC -91D9796C -F03960F6 -B62467FA -8836A487 -6418A93F -60932160 -3B72687C -37BBD7CB -1001C76F -201999EE -5955A1CA -925351D4 -767540E3 -570BBF27 -A073D4D8 -FE96246A -44784995 -232C0150 -AB7BCE2 -D47BF099 -BFA6A422 -70F4BC01 -C2139449 -F9ACB817 -26657111 -13263449 -7989D26A -2E972B3D -2F1C1C6 -930E479 -23243FE7 -BA7DDF9C -50C8AB43 -952377D6 -4C6C2B3A -BDAF48F3 -1C0BAE6E -7F6A8C04 -F529B9FA -9ECA4162 -342E6562 -9BD5EB52 -A14DB3C9 -14B1DC2 -4E1BB6D1 -9A1158D5 -73F84EC -685BD9F5 -8CE72161 -5F116605 -BA861D43 -A7150AC2 -391A105B -C8D798E8 -16633750 -33B29C4C -54211362 -34C2D5FB -CA197734 -A635990A -4E606FD7 -9D56673B -89976DD5 -5F2D2794 -81E95955 -9377829 -5DED53B7 -FEAD5592 -1CC6419B -BD3A45C6 -65FACDCA -7EAD0EF3 -EB856702 
-D857FA75 -3B92DC0D -E66AE58C -51912618 -C63C75BC -ED05B556 -17EC2B32 -9F692578 -C706059B -D88D5576 -C2661C7B -6D7751C2 -119292CE -418700CA -2A2BC3D8 -CA20D341 -8A8F325D -D4A2DC8D -959FD62 -67883F8E -FBD3686B -6B862363 -F8C13880 -FCACA893 -8215D90C -67567E2D -3B501BED -7AFBFAF4 -2EC3CC34 -B360BFD9 -716C5E9A -907B1432 -E253CBD1 -4DB52F87 -6A37A21F -C860A6A2 -72DFE5D2 -84E0705D -80DDC195 -1ECD4E92 -2D2035A1 -B10A5B53 -C9AA9A79 -E999CC8D -C8C790EB -F7629DFA -93158872 -FAB6E7DF -58A0A3D -6104EAC7 -2BACDD14 -A8E3DE88 -AC4E16F4 -F7042189 -5AA6D923 -F491667D -C769767B -46EE7E69 -CE4BAE4E -FA1BE581 -2BF14278 -5356E813 -6225B503 -D33A6F26 -1A629247 -BD844A35 -E33ADFB -EFE720D6 -3D49752E -AD542CEB -EE36C608 -99FD833C -BA893EF7 -47E4A8A9 -B269C1DC -CEF39BB2 -91FD5B03 -C02E6C1D -29A3817F -70894875 -8C851D1B -8446E920 -8CBAB8AE -D9D7B185 -97987DFC -ADE83493 -4CD1FC4F -1D82738C -27665936 -CE3C907 -990136FD -E1E40CF2 -A3E15CA6 -DB7D4E0F -D8E87ED -FC23DA2F -76A6A0C0 -1C7F403F -380BCEC9 -C2BDE917 -74145443 -14C0823C -8D73C415 -BD7B9DB4 -C83449E7 -364D21C7 -7F01C97E -9ED9F208 -51417FC4 -D557CFF2 -5ED6B81F -BC0EBF41 -608D56CA -60AA90AF -8FC8A8D6 -809BE4D9 -47CD9035 -8CE71201 -B442C067 -A380EF4D -7B74A914 -513ADF78 -63E5C752 -6D4F2B4B -82717D99 -EC19F48C -7D0D1EC5 -944D936F -358B8D1F -D3A7E17D -5E6DFD92 -D6D2B538 -133AC914 -22C4BFCB -A9F4ABBF -7DDED93D -6836C5 -3F10AEBF -71713080 -A1868A02 -EC341DE1 -33D409F1 -41EA5D35 -47F18F89 -7C062A2E -1C66DC90 -D5E11362 -FACCDD77 -D96EA1F2 -31676D3 -B00B9D1D -36F80278 -754F427 -3D8C40A3 -D1FB426C -ED4869D3 -AD137726 -9704A7D6 -107A0E2D -AAD92A50 -58019B5B -F6FD55A -E876FBF7 -13451AEB -A530BF41 -11FCB24D -EF5D7F1B -BB65E3F3 -DCAF1904 -4262AE51 -8C2318E1 -96E7A13F -DDA281E3 -7B44E7BF -8048EB55 -AFC8D749 -D3F7E592 -23FF8DE -105E2923 -969758CE -B1BF840D -D301EDDB -42A3C6C4 -2C934ECA -B2FB9ACA -452302A4 -C96F49CB -D7342392 -48A6D82C -6B831657 -1A6989B2 -312D282B -9AC1D170 -3FB3070C -D83B178C -D894496D -5FFA91E8 -436E970D -54DC6812 -8CCA890F -96971388 -9CED7192 -216196F -BDBF8734 -441B7DC6 -8FCB2D4 -1C3375E3 -19EE1338 -E8BD4F25 -D65CD246 -85157D36 -34A4CE5A -BFF7BCD5 -41DD5123 -D92D0021 -C0265B3 -652BE05B -7B31FC27 -E8BBC732 -E5DB7686 -2D1EAFF8 -2283884 -CE0E4257 -1936BB27 -6ED44FBF -476ED2B -C249E9F6 -21C0827C -8DA28ECA -707E075B -10EFDAF6 -3DF4B474 -24AC5C3B -81F8A453 -8E1AF272 -E69E1816 -C40F1B4 -5AF2AD1A -C1236EE6 -78507240 -588C4851 -385396C3 -BE2210DE -E8FC3FE2 -B9E7C8F8 -A33939 -B9E8F7DB -F7DF1BA4 -400E6C2F -1139C2B3 -8195BA65 -A6052E5F -29E1F01D -512ABDD6 -ABE172A9 -350BB8FB -63D89399 -6C7CDD2F -F6E20A15 -36947843 -7D26A79A -133DF31B -AB375C67 -35D4F0E9 -8060F5A6 -94893A4F -1B4E1612 -431938A9 -F4F22D48 -E83BC91E -98D9DF02 -7CBB518A -947735EF -16DB6C38 -7BBEB95B -393A60CF -6984032C -F1879BA2 -F014440B -61CAEF50 -F9BAA90B -6D9CDB7A -4A4C3D3F -DD498DC8 -E27FE395 -AEA01257 -15FEAA99 -61A173A1 -28EFFD56 -A27152DF -10C613A7 -47AFE324 -5B4D4B5 -AF67027D -11ADBB9E -F8B22312 -4A9C0C1D -E94F39C8 -9AA4F0E2 -4C394A49 -41ABACE1 -6A96270B -171F3E81 -F29DB470 -A9E7F67E -6B445012 -B53EFB86 -B0AB92A -484432B2 -7C789E2 -116B012D -5A5434DA -83DD29B0 -418637F4 -C9E1FBB7 -FD84E0E9 -BB44A4ED -4847C699 -61807BB2 -F558A9F0 -264F9191 -697F6915 -EBC115CC -A1604C6E -9CD73651 -50ADAD72 -DE3698D8 -DAD728B2 -58F5527 -C58A4754 -C8CCF740 -A5CD4E0A -966E50B5 -6DEA9EAF -66DEDD5B -CE18EE1B -E0293294 -3C0C586C -ED04E099 -A1BB7722 -78AF5367 -3F0FBBB7 -4F623EEA -E3E1A85A -3C8EE1B0 -D2851D20 -F07248A0 -713EBA3 -8CCDC87C -B5ADE0C6 -54DC4354 -F7F43DE5 -AB512848 -69136DAC -71CEFCD8 -5F264F19 -D39D50DA 
-A184BC23 -57F38C31 -34DFEB30 -6B39F755 -60F7B6C8 -EA7FF406 -914CD331 -F4A15FC9 -68DB20A3 -6609D547 -18BD6EF6 -F5DDB763 -9E2C6236 -A9C0CD72 -EE8A864E -FA9A7891 -DCE7F5DE -4E5A9B63 -FBC574F8 -13C26C91 -70A2AD7F -9514018 -7786A6DF -708A442D -8AC98261 -57EC9F69 -D8B92F1F -5525E8BD -CFB927EB -47BA617A -4A71DA0F -9632F7DD -4A00D653 -3FC603A6 -A34C3C9F -EDFCB326 -BA31E996 -4158D5 -888F01B5 -F001473B -D67ACDF1 -587F7E20 -EC9AFA96 -6942D697 -76FEFEE9 -ED260881 -53D50BC9 -43FAA199 -DA4F8CB2 -D7FE8FC6 -7A659755 -394C88C8 -EFA3AFA -87710DA8 -DA1FF12A -C5D4E7F8 -4F0A47D7 -E7C2A799 -EE894D65 -20E4FD0E -8E51626 -17BB7611 -E48021B1 -4320CA45 -5315D225 -39684701 -3E943281 -B3B7B298 -A63E5C66 -11F2EAE5 -2E339781 -9BE79114 -187467D -9479787B -565D0658 -B43DBE73 -67F7EA80 -D1962413 -BF4B89AF -AC03F363 -1587941F -B7A14BD6 -AE1A36A4 -BF710690 -8009F7B0 -FB37D608 -58934215 -327E7B3E -A2BCED7 -57DB9C90 -3E7E56C9 -E554BE2A -6B6273A0 -766F5A68 -503BD141 -586BF1E1 -AF75978E -D93FB741 -75268390 -BDEAB299 -9871DD6A -9C042A7A -4CED46AC -706B559E -9C9CE827 -EFDAEFCB -A1AA3846 -330AAB65 -602F6FCE -DF14BBD9 -8BEF0FE8 -CEC4AC8B -28456573 -95AB0149 -43E11079 -B50D7970 -6F8F89C6 -B96DCC6C -E114C8BD -CF3F36AA -E02901C9 -8B452A2 -8AFEE7A2 -FD7C3D61 -4DA46DA5 -BD5C204A -83FB677D -42615EE0 -3783255C -9FA48033 -270F0FCB -157E94E0 -CC89D359 -715FCAEC -32EF8DFD -829D0BCF -E4FC364E -A629CB9D -7CE1FED6 -D6E9FEEA -24E55CE7 -8BB2DA23 -2FAEBFC0 -AD6EF205 -96142124 -6891653D -C5061A39 -9EA7F89C -D2CA9BBF -544A569 -E908D41E -EAA11FBF -4250EAF7 -6A5E60CF -5F84A53D -4324D154 -57320611 -DC3C692F -24685A97 -40F011E3 -25A224E -3712F01 -30F1AB94 -45F92B8A -450F8D4E -F3EFF92B -EA54D0BB -7E10A58D -D51BDF85 -FA6E7358 -A16E06FB -CA158DFF -9AAFDAD5 -AA48F649 -A4A78E50 -F2F73CFA -519FA6F5 -32933CF5 -9E55F1C2 -806019A2 -E56E0B7E -5F598AA3 -564C6D40 -757BDE5D -30757BFF -B906BD37 -52C6C503 -D2B00C73 -5969C7A1 -84FF193D -E668D8D1 -71E66078 -A200D7C6 -6585828A -FF8864E8 -B9EED36 -12C9F3AB -2F2C4A2D -2998FE0A -A1D47491 -59463A75 -1347C537 -77000037 -E6AC6FFE -C74CADE7 -83B75335 -767A69EF -4248CAAE -1DAA4A34 -BBCDEA3E -CE177B23 -59449B11 -A9DC563D -85589ACB -8926A959 -CADAB503 -6A1E5AD1 -E79EAAB5 -9C25D798 -B4750BE3 -249329AF -724F7831 -F4D2E094 -CD605F43 -CCC933E3 -4231A56 -8D15BB64 -A7B1E394 -FF2B04CB -7260C6F0 -A483E58C -35E5FBAC -A3D734E9 -64BF02D7 -24F8B625 -FBDA78F6 -6FA335D5 -5CAAE8EA -EBE22B69 -9BE5C3B2 -81028FF8 -E20FD2C2 -CC8506BD -E079C912 -BDE0AE94 -AA4AD182 -AE682162 -AADAA077 -C757CE81 -E4BBF694 -8ACFF53D -D1E85D5E -E29E9979 -9DC46E06 -A8FB412B -CA71D109 -987A6F6D -E5A13D87 -BCF3C6D6 -DA5A6320 -E78095AF -C0C4710D -7F06A362 -FF3D8A8F -428A02D8 -2EBFAF55 -D25B93D4 -344E75CC -ABC855A9 -E3577D95 -843C4274 -F5326A2D -EC6EB288 -7C4C82E6 -A70953D8 -8D8B314 -8772F0BB -3BA5025 -1BE5CFF -9592B505 -B9FE16F1 -EF77DAF1 -4C7B4119 -8B8FEB44 -3542576F -375EBF3E -D0927BE5 -2C6A3AAE -45D18D70 -6126FAB3 -58146389 -FBF50CF3 -3129860E -4B721C54 -95BCFF3C -DDF12106 -1E2428D3 -827395A7 -35266B84 -3CC089A3 -B8198C2A -B8EBD35B -7EBB213B -A93DCCAE -CBB25C42 -2A03D874 -46F6CAA -82986B02 -47EA89A6 -2C3E7BDC -852B0630 -A928EB9 -66A2BC66 -BBB43A54 -A6F55CB7 -FE990460 -5FA8BA0E -1CD34B74 -1C0F2BE4 -FE6C53A3 -C325B6C1 -A980B3D1 -9F031392 -31E17C1B -38B6D6A3 -E30D49E5 -E83F8C4F -BCF13E0E -28124F6E -57AF5DDB -691BCC17 -BD071C94 -DF4984C2 -8579EA0F -92150479 -7BB67579 -58D6EB84 -97754D0C -F569F71B -9990D0B5 -56DAB760 -9E988907 -9679988F -3EC5E4F4 -328D67D9 -317EB4E7 -5E6D7E6A -BFEE035F -D12E6060 -4F2A7A2D -F65F5B73 -54AE1242 -ADAD3A5B -61A81471 -FB09DC55 -72874DB5 -5302F1D1 
-8B5F6A90 -82E98E7F -E808315D -DDF5B32F -C35356A6 -6F1FF7AC -1549941D -1460BF8A -D53684E0 -1A384C42 -D319924E -B0B1824A -2772DB36 -BA61B594 -712F9397 -41F5740B -C00A34B2 -F2FCE526 -4C874DC6 -FD5ED831 -301E874C -CE244111 -D6AEAE23 -516AF534 -FC101FD2 -EACEA514 -C23A0FCD -650BA0E6 -5C877E20 -ACB5DAE4 -5E56E78C -1AE6F2A -705046AF -7F53EEE7 -AAB30590 -2A1BD5B6 -300A6D8F -FECD64C6 -A8FF2EC9 -27B583C1 -29CAE718 -66D59871 -16E8C79F -14D20B3B -446862AA -1C5EBC93 -3831B437 -556E9FE -B877897C -D6FE7901 -D19ABB8C -964EB757 -D1DAC489 -B60AFF4D -31D01640 -A963359E -E233B856 -58D923CF -EF31455B -EC071BC8 -94F64E2E -F9384093 -36C8A1F -AC4A701F -657CD41F -731CAD58 -374B9753 -EC20E4D1 -E58959AF -E83E1021 -B7C14D53 -A651DDBA -D54BD80B -7291E323 -31310762 -A54A712F -482BD448 -1FC7B562 -EA69143D -4342848D -C4BB4C5F -B0B43A48 -962EF559 -5C395F65 -6C40A83D -AEC344E3 -881E5E3A -42D50FC5 -144B9CA5 -15DE8B4E -AB91DED2 -17FCB1B5 -87804536 -102205D0 -E57C9F29 -5D08E2E1 -A4AA0B4D -4FB1351D -F3BFE5C6 -5C439E04 -33A0A6AB -826A9A49 -D165E206 -229A4A83 -4897797B -396C7F04 -474B2792 -351AD33 -ECCFA3E6 -901B77BB -42B16DDA -FB3F707C -C6816341 -CE19D1AD -8297E119 -4458AB5 -FD9CA7B6 -250517BA -2E23BFF5 -F0D1C983 -699A7882 -557EB3B1 -D0D5822D -D1117539 -F271C507 -9364161D -6793E35B -8AF902C6 -DA5443B8 -EE1E1A0 -B941E448 -DE0E773A -4A41AF87 -D4AA88C2 -80B09F9E -53F2B381 -1C8EA42E -3D15C64F -93FE9251 -B242B629 -F7ED2942 -6AAE674C -EBF19F56 -E299D4A8 -4F22DB1F -20998388 -4742F182 -F6626B60 -992FB48A -26822FD4 -784D31DD -B84CAF35 -B8163E9E -2A27EE0C -FF09CF79 -81C74BBE -C914DAC2 -E768AAF6 -FFA5171 -CA93E6BF -E495891A -482A252B -18F8FD7D -DE52E34B -A4986019 -E363E1CB -EAF53373 -59FEDE9F -2FAEAEB6 -DCE56F6D -F10257B2 -7609DFE6 -4D0D263A -12696B9B -A56E0541 -8F12E1B7 -9E8E5761 -98C5816A -F2F8EFA5 -B91C1CF3 -59A19F9B -9235B967 -A58D23DB -71377517 -C50BCDB3 -60D31A7A -874811FA -58A69900 -CD8198EE -E4FA90EE -51352862 -3654B5D6 -B0442DA9 -5BA67D5E -A9B84B57 -FF61069A -21102ABD -8E6B59D -1DBF72C0 -9772AC77 -F26B2827 -E985C97D -CC311683 -E8216C66 -13E346BE -199D0C57 -578B8B90 -84462520 -7B33C9F9 -E18A5CC0 -8F70C75D -B9773D99 -8A8BDCAF -78B8631C -1AA0C9F2 -76FDD536 -8CECE336 -999E6F4F -29EB2768 -3417B854 -A56B87D4 -CA2F016B -69DED6A1 -8AF8128C -27732A2E -654939F8 -F0DE0291 -501F84CA -815055FE -99B595F6 -627F49E7 -2A7BE8CB -959032DB -7FD03C7E -54ADDCA0 -62EB2DA4 -6E458899 -2FE00E32 -B2E74808 -35803F87 -7369F52B -1586B4DD -61B61CC6 -1BDD1B8F -C6BAFAF5 -C4339DA2 -E1D3A0DC -8AD49CC3 -673B67FD -D81B434E -A41C5AA6 -BED70576 -22877C0D -71A3DC2A -FDE1F4AB -4FA1751E -DADBAFB0 -1C44975B -76EE876B -E3B81546 -86466730 -6A3F403E -255A72F8 -2D2AAE1D -77717644 -63E003E8 -40CDF1FA -FF37E1B5 -F0FC3CCA -45BE9807 -D8611D58 -D62AB82 -EE875225 -B8149434 -FFD0F0EB -2F3699E6 -7EBD4BFA -3E393CC6 -39777EAC -FE2A33EF -9AECBEB3 -322B14DC -DA2EB056 -1C942882 -C42C7C32 -A20E0D02 -E91D2834 -D465D9D1 -FC60192C -D3B7FCA1 -1E9B03FA -40323FF4 -DFA3D47B -2C26930E -391E6E18 -E340B164 -36FD76AB -204B0D9D -5F5027DD -FB05E9F -33C3443D -ABF1832A -152FEBC6 -FD83B071 -310222F3 -E07F3402 -61818FE6 -6E14F915 -F89FE609 -86FC4F17 -C860D97A -51B0EF08 -779B9BA3 -6D9C0908 -D14ED3D6 -692E8084 -233DEE29 -B85FF171 -12FAD29A -D37B7593 -AEDD969F -8E76CAF6 -A7FDDB58 -B5B7DFEF -A8881968 -50D65153 -D57A8EEC -7D144C49 -99B10DC -5660CCA2 -C02A1001 -7EE499CE -8C281511 -8B43EDB4 -31E58C4 -E9EAB787 -48BD8C20 -87C33E72 -9FD28F45 -9D8374B3 -3AEBB8FE -D25F7E5E -65B705F8 -ACB7BA8A -C7CE28F4 -1A365014 -12997929 -BAC3250 -3DA4DE9C -D90B5C3B -731BC23E -F952A129 -E5FECF74 -26D6A0 -B61C74A2 -B18937FA -E034B86 
-6B3E73E1 -FC5891FE -E6F5F72B -BE380D96 -DB6DA2C1 -8BCAC0F9 -FCE57C36 -10230AAB -8E0B6278 -962C5A14 -4C257AA0 -95B50454 -478B67C6 -4BB1F24A -9DE453A7 -241965D7 -DE5E4EEB -77BCEB46 -A87FC004 -4EF35145 -35910ECD -8900342B -C9A653E2 -9AA2501F -DD4D16E8 -A2340ACF -F846821 -9A2A16D3 -33BF35C8 -185C4C5E -9A3A7865 -6CA5232C -8A93214E -8F9C13E3 -CF212018 -777D973A -3531924D -DAEBD9FA -4C4BA7D1 -C6DD4E96 -72F0CF35 -AD82F177 -B8486F78 -C89FE003 -991E4764 -F49CB023 -14C3A164 -B6B2733F -F78D6623 -F1C9D84E -6CE9487C -68F59E42 -B13A9862 -A60DF7FC -5680C3EE -8DBB03F3 -FE660987 -7F302425 -98915B -3EFAFEFE -819E3A26 -CF086D8 -EDDF6ADF -314D6342 -C7DC4A97 -231D9E12 -C8F0BB37 -E2A20026 -A9539B54 -E2047DA5 -3E5C9D4E -F91C18A5 -37B1EDB1 -DE88277F -765DEA9D -555D803F -6FAD1516 -41299623 -66D3E9F -B040E22F -28C55A65 -F5BBEB1 -8F85CC9 -C1F1FCFB -E0ACADA -FD138889 -F4E18B1B -6EAD0B49 -38441326 -17AEF5F -5A6EF970 -20ED5B3A -46A95C2B -CA7475C8 -8FA66C0 -3F831698 -E2C27DCC -7AB6C35D -9D979A50 -27F30FC -4FA19438 -321E637C -AD72B955 -C7BE128E -A428B5EC -48817E5 -7EBF668C -8DCEC036 -272C5582 -F8175767 -6ED7A880 -71E2497F -6EE3595D -D2579856 -15439021 -87C91FDA -A5682821 -E3FC8D77 -1545F959 -6341300 -D52520B7 -B0A0FAE6 -6F1C6BFB -226DE897 -4449D2DD -7E378981 -55A93F85 -91BFE157 -434EAE2F -AEC8DFBE -929F369C -DF654EA5 -CC2D5431 -152C1E93 -D800D93B -1969CB8D -46776BE7 -DF3D435C -2CD82C1F -241528BB -88B41461 -19463B47 -CD61AE6F -3C5DFE3 -8053B926 -5D0C9D00 -75240C8 -53A9DCF1 -B217E766 -616C0F89 -E73E36F5 -1E3E0BC3 -B6C474CC -9AFE8273 -AAA496CA -E9770A12 -9C3E2617 -3CB73C1B -2065FF5C -3A2B3E59 -280EF886 -B6A728CC -DDEE48DC -BE40F70 -449577CF -E5D72358 -5648EE48 -F6B9BB34 -F8E354C -84895AB6 -95DA9283 -882AF6A3 -4FBA089C -D27070D7 -17784421 -DDEBCE6E -4E6A43B3 -82AE90D7 -1A524C8F -D1C0C339 -993FA3FB -52CCA574 -523FF9E9 -764B2F69 -621F0749 -5C95BE3E -F2A36CAD -5C92ADE4 -F4238C46 -BDD0079D -CAE6D9F9 -5F3D1307 -9345998 -22C3C499 -631B8B0 -A6B9A88B -471749A7 -6BCD27C8 -5D371C05 -57081397 -F6CEF315 -1BACE19 -B7BF405 -5B6DD011 -BC74DA95 -781349E -F22A975C -72A5A101 -27BB6AED -933B9126 -14FBE3BB -50D095D9 -1CC937B1 -22CBC28 -1A6135EE -197E93EE -26A1CB1B -79BCF079 -A0134157 -9F232A75 -818BB26B -B2339659 -911E36A8 -AF2F9282 -347C34E8 -6255FF5B -1BB79854 -9A16AE8C -2A3D9B7D -93795FED -8284A6D4 -E58090F9 -A36C45A3 -F8065618 -4122FC06 -6F4DC90B -5336936D -F4E4BEDF -7A885091 -E19CB61D -9D398B7E -C9C4AF2D -A1C076FC -BF60AE9B -CBF56B80 -11038EE3 -4B78AA1C -59C72649 -D687CF08 -B182CC2E -43E4B13A -83126FE9 -EB042718 -627C8807 -47474E59 -3D317A4 -33919B88 -E00CD1A3 -3CC1F4AF -2E91597C -CDDAF2BE -3D3A18D6 -5BD6E47E -3D6A5286 -456410A0 -2B51CF4E -B55046FA -FA43946F -F90AC852 -A064AFA3 -F84235C4 -D316F3D2 -1BB0D769 -46905EBA -255EE03A -EB4D2C17 -6AFFB5CF -D755618F -ABECFB93 -594CBE9A -362C1B5 -ADFAAF67 -ECF2110C -E86FA43A -C789EFB4 -D9FDCC95 -F81FFEBB -C239F63C -16BBBF2F -B1AFC20E -B00BCEFB -D6B41A49 -A5856CBF -E2753B3C -8C03166E -537BA621 -B268C813 -C1B8E5B7 -1FCDD47C -BB257FF0 -37B89618 -6AD0F548 -C5EB6B1 -482EAE33 -1F898EA -C161076A -8112502F -77D0C22B -B1EF60B9 -D8122593 -D0ED144 -A258567E -7FCB11B8 -FC01313B -8A39DE11 -B9612887 -FAF9C5E9 -AFB24528 -C51F261D -15A83256 -E560FDB -5749D494 -61C88749 -F7C9978C -41583770 -73AF53AF -EDB828F7 -5B9A931F -B33EEF56 -3ED0DC67 -915BF5B -CD090180 -3659A346 -E09A572 -B0EB23 -F35F97ED -8708879A -E3761150 -FBCA868 -8EE5D700 -67931F7B -E3819B8F -FA9DD938 -3C3DD434 -FB62C866 -9D6A734E -2BE14923 -7ED6D7BE -423CF38D -CC4C4156 -898F3254 -405B1D62 -25995FCB -C062465 -12471B35 -6DB351F2 -5F23ABC5 -49EF7D2C -91B401B3 
-85DE49E0 -81D81230 -9824E09D -767C5312 -E0744F5 -D99A77B9 -7657BA4F -46CA1289 -5D2AEFAC -ECDA74CB -DBA899D3 -AFC6E7B2 -DA79D8BB -F6508AA8 -6D0E5BF -76DD66F3 -DAA00B8F -C7EB98CF -65189199 -FC2F2235 -4F19D2CD -48D4E497 -67A7643D -777B5F1E -2F089D44 -4E841850 -2D371993 -B3ADA2E9 -421A44E9 -1D470C4D -81DA8998 -71D42D8D -E5F09965 -24BDEA19 -F8FB47FE -1CA01D53 -52A53F9B -B13279A7 -840C17AF -F27507D8 -36AA55D1 -29616808 -E5C25388 -404F7A96 -AF6CAD43 -AA2A8D86 -6D0D5DE5 -B60B5047 -F904AAE0 -9BCCB969 -73FFDDAF -AEC2E379 -DDC3B6E3 -85273FF -4F23EA7 -F1048821 -432CA7F7 -FEEFB49D -2749D00 -F0914942 -878203C4 -AB657B2F -FF754E6E -2A1B63BB -2B094F6C -8DD98DF4 -7E8810E3 -D17A81B6 -BF297F6D -FAE3391B -B28655B9 -2B4507BB -702B2563 -FFC8858A -B8DF3A03 -80018970 -4387C2E2 -81246EAC -1201F4B3 -9AF9F9B6 -29F63494 -98A87F7B -C637C322 -BCFB7066 -3505C623 -10BE77F4 -BE44797A -2EF31DB -C8DB4396 -FA7C2378 -AD3C30C3 -C3AEB714 -58183DA -5D961567 -1E42A328 -94430ED5 -866A3D67 -84B148EA -C823439 -80B57816 -D6395105 -B389CD22 -B574BF88 -F12CE1CF -C5B892E4 -94F6CE69 -9387A05E -C806C5C5 -B2823B0D -64F1253B -DD3B64F8 -4C6980E -BA9825C0 -573D9CE3 -A78DB442 -FB5510FE -C45DE1A4 -66DFA70F -47960901 -68D725DA -ACAE1E6B -60F9360 -8C9D39E -E78D5AE3 -A1A0BB75 -80E4ACAF -A0FD5042 -5E0CBC82 -C0474CF6 -840ADEA6 -6F972DE8 -5D16E0D1 -86688917 -E08A3150 -BB5FB87 -2EE82F9C -62867EB6 -B592C066 -64852270 -7A7634F0 -58C6FA6D -E83506E1 -7DC3ADA6 -E972E4D5 -4877FABF -CB37BA71 -7BD3131E -9CA64901 -C072094E -A28F50EC -CBBE833A -225D213F -D4266D98 -3DA08099 -22481B45 -899C4804 -3A8630B2 -7227F512 -FDA1F80E -E5515F91 -6EECC93B -4611F561 -47AD2CF3 -ED2A807A -D694C082 -6DEB43CE -9DBD4F70 -8C918F0D -28C5219F -EB23A332 -AAAACB21 -9B053C22 -6C5AEEBE -B1941AF2 -DEFAA083 -255DAF18 -B513F3E8 -CDE47DE0 -43DD2231 -71BA21A -AB772E2E -510C581D -93A91FFB -ED683872 -E561882C -C503A74E -E274473E -3F7D95C2 -AD48EE4C -887342AA -F4D0DC01 -68023FEA -F996EC8B -F4E33500 -8191511B -AFE0184C -8A6D392B -EDFEA13A -AC3E90B2 -94E7E8DF -76F491E4 -D45224EF -D32B9CD0 -C7167945 -2D56F7E1 -994E7AAB -65EDCC15 -AEAF497A -BA11EA7A -53D5812F -DF05201B -10A9356 -ADAEF92 -508293CC -B45B1908 -DD8C2367 -A385DBEF -A77E11BF -DE9B1792 -A9FFDB94 -AE48AD8B -E7798E96 -BAAF5B51 -44648397 -80303BBA -FBE848C0 -74F37EC6 -C9C0EE6E -1D80DBC0 -6CA37DEC -995387B6 -BA2D99D0 -D1869967 -39D0BB45 -36E391CD -12D6AB0F -4CB16A65 -8BED7413 -99987FE8 -55BD54E3 -5568C11B -F63606C4 -AC4D0747 -3032CADB -52407898 -C461B987 -1F3C8122 -C7E1B1FA -BC1BF34A -724843D7 -2DAB612E -F5180E4E -67FE89A9 -B7641E8E -185E5197 -5FDD9BA3 -C6AC4D7E -DB020625 -16ED5F8D -5A2DB8DB -58F7DE17 -8231D332 -9977723E -CFF39DC3 -A8B71C3E -3335D9BC -D34AE6FB -31559150 -E6494443 -D6C0C713 -515C9C4F -AA09B03F -EB32806D -981F48D -DAB324BE -33EDC165 -88011009 -F1120840 -48119894 -137409C1 -7F45314A -DD74A5A7 -C2251ABF -AA45B420 -4ACBA24E -D020B449 -50E55E0F -D78DD382 -F6E82B05 -9957DCE -1410E573 -CA93CF29 -83DBB1D9 -7AD6D5D4 -7921516F -8399BEB7 -DF07D89D -77AB752E -6D6DBA45 -890771BA -E87CBF52 -F90A7590 -78967761 -6617D522 -2EEDE919 -F28BA9E9 -E1E3AA90 -2CBEBEF8 -1D8A37FB -9CE04F02 -680B5A92 -561178BA -A19545D0 -DBDA24E8 -A7863CD1 -F1B829CD -2BCBD34A -B8DFF2A6 -2787D144 -A075B93E -AA7BC361 -B560CBA7 -F8E79316 -417B968B -9FF31C37 -F88ADDD1 -99A6E199 -D3D400B5 -79F33397 -4AF6EA07 -93EC79F3 -F7D9C5B8 -81D7EE3C -2898D7DC -4B8F67DB -D52D0F0B -10766E32 -E228EA2C -54C96B61 -74A99589 -7E60A886 -8FAF588 -634DD09 -1258CA8E -13E40785 -20861E8F -69BF3004 -E91E2BC8 -583A44C3 -36FD8D36 -572B4202 -BE43EB2C -65F871F3 -723C1C02 -65EBEF48 -8DD407C6 -513D6B1B 
-150993D3 -4C771124 -A18E6FE4 -C46071C8 -D824EA73 -7A54B17A -4AB1E70C -F7D078B5 -A315F9A4 -9A39A8C8 -CD34D2A6 -8CDEF63D -B273EFA6 -E15B8FB4 -BA2A092B -E540DF83 -33A3B82E -13BB16A4 -4AA79F4 -DCF1D80E -65B77A7E -80CB308 -9A407BA2 -D32D62B0 -DB34DA97 -109F323F -4B07538E -40AD97F -A810835D -6637380B -1ED7261B -DA642F4D -309A47D6 -9009C0E9 -7D9D6E1E -580CCE0B -67F92DAA -1936087F -342D9739 -A191FAF4 -2EF56C33 -EAB9AD66 -FB6E4FF8 -E58333E1 -E42B465D -2D61F572 -9FA12447 -848394C4 -599C9E50 -28675899 -8610332C -968735B8 -ACE06F66 -266C841B -8512CA53 -A25D3088 -D55264D0 -AC3678A9 -D1DF668E -5BEBD716 -DE986F08 -17DB60F5 -B88254C7 -BCA0E5B2 -E78B3459 -494B6F35 -5E0408F6 -A8638621 -62C27360 -8D98C864 -37EDB15B -ADC93344 -4197C21 -FEFE1A30 -ACD03EBB -A3A230A3 -45741EE4 -DE86AD8D -CDBB302B -303A5D5D -A42863D5 -9019ADA8 -EB8E036C -A5558A5D -A4D5AF4B -F04E0726 -C5AEA4BE -FCB9BC09 -3FF2E51A -53E510E9 -86FB3D5B -3031BBDC -1294451B -48879312 -972E95C1 -B8B861CE -FD180B55 -F2930D40 -31C5CF76 -8C132827 -CD696B0C -1446B194 -436D712D -9089677B -493A420F -DF82C186 -377516B8 -20ED2C1E -956EA0C3 -D26B4EEF -BFE59283 -B4D36719 -67B01DDD -6F3CA60 -BF6B98D -1B120FBA -7CF4D06 -83091BF6 -7D3F5D85 -D3E48FAD -E3025BBD -CA30F611 -64D1D991 -6A688C9 -D06F9682 -D346BF -E4DC58EB -4C4F7AB5 -9D5CBB9F -5536C074 -CCD9D1E4 -FADD0C6F -769C50EF -A1F0E40D -72EF3FEF -C421D7AC -182D7491 -3FDDA320 -49F136EE -4EFABBAA -7228A4DE -40A616A9 -EA37E4ED -5DADA164 -2F9C5671 -4D3D4CD3 -3A68B35E -7A26619D -11A14309 -D886253C -8F545687 -3666D9FB -131A5557 -9644C9A3 -FCC47DF7 -7CCDF226 -9FCBB958 -9DB97B96 -630B5596 -1B592B4C -2AB5341F -5817D559 -3C0A5FBE -F65E3830 -1D38ABAB -353E9D4 -41647BE0 -63DC6FC7 -CABC6846 -A7B8001D -2C018A1D -435D877E -3E5F838C -9709BC31 -ACA0EA75 -86A06AB -DBB06480 -2A09283F -D3A83953 -90967E13 -D055B4E1 -3365DA22 -E3FFD521 -50205ED7 -E907F5E6 -4D7D054C -C66CA376 -2A72C5C6 -793120B3 -170AC5FD -C4CFDAA2 -21A3CE3A -19F354F0 -FCE7F112 -279C9605 -AA9FBB98 -E269592C -B8E5DE7F -AE0A77D5 -45B4CF97 -6E9EE4C1 -C31F7C62 -D9E8C76C -75925FEC -EE34024B -73FEA2CD -BC601F7D -75776A1F -AC2A0090 -AA6E1956 -64C62B96 -D73C3066 -2F9C7E78 -7F1529BF -5974399A -79D31554 -2D559A9A -458A1BE -A820156A -26764010 -981D62C3 -A5C8534B -F8A5FAE0 -69EA2102 -2F62B77 -2AE14076 -88EB9A0A -36B5EF31 -73E63D55 -D6A15D81 -F5C8A216 -1EEFBC6A -8F16F5B6 -87064008 -7EEAA78F -35A4B04C -AE70F49 -9642CC0B -3199A9B1 -F0E6FE1C -F682DFA -E500C5B1 -AA1132D6 -3B3A2D9F -86C9A21E -BE1422DB -2218AF29 -64512A76 -C4624FF3 -F4E52FE4 -8473989E -269C4193 -B67528F3 -76FD1A6F -ACF6869B -DCEBBBFD -3ED92226 -3FEA0905 -2C4A131E -4CC5DF7B -63E3A62 -988BE035 -BB06A621 -61C2E087 -C2E46B3F -78010D43 -9EC6DFEB -3781CAAF -6D000EA0 -7E952EA8 -2874E849 -FAA54995 -45DB5F56 -8CB1094F -336FA04C -8CCD3F1C -A40704F0 -7AC652EF -83E998AF -8167F5FD -AA7527B6 -543AF979 -F21F16B6 -9A4E00F -1686D0AC -FB0EF404 -EBA9E0F4 -1A9BCC03 -F66D4C53 -4328EB30 -DF52A096 -4A61DDDE -3F19448E -5F3E0EDC -C9FEB2B1 -D8EDCB6 -4EAE672C -47FB8C0A -B4D64E67 -7F5AA323 -38796C27 -3ED30872 -6241EEE1 -AAFD55B6 -F31CA43A -54CE5828 -6D9103FC -665303B -ACD9B1CC -4961E187 -EEDB6D29 -544577B0 -9CC76FDC -718802FC -2EDC02F0 -6735768 -FC351962 -30F3C426 -7BD3050D -4C19A7C -97DC5F3C -720D7F42 -2F735FAA -B067A6FB -4F5EF847 -F500ABE8 -FD9E7B9E -8C37652E -B6189BE1 -BAEF411D -2584FC7F -FEA99C78 -873C71EE -51491598 -8BCC9600 -60A2176C -9D6D9475 -94E1A54E -78124EEF -4DDDA3D5 -DE77F79C -67E3A57B -1E75B5B5 -290C7ADC -30FDC46D -63BDBBD7 -9E61B234 -666593DE -8C7C1E27 -9C723CAF -EF1F2DDE -CA69CD52 -4DE571F3 -A0AD3A46 -902EB90 -D761B7BB -9F209F04 
-15B1B5F -5C389CFF -B736B159 -97994EC -A2DBE074 -353360C5 -19E771B -94A72285 -2F4706A0 -64CC6476 -627BE8B7 -90FE94EA -7D02778 -2EEDEFD1 -9A5EF7C -E7B7B437 -F21A3517 -F33DF1F0 -7A865164 -4BFE70A7 -88A8B45C -C0D320E2 -E93442D3 -AA086067 -11B873ED -1BE002FE -2E799A3 -2AACAAA0 -EB1A91C7 -9FA88D6D -4D956843 -75FB8348 -1584A0EB -4C9D1E1A -413548BF -FA0CF448 -90D1256 -BEB74BF9 -EE7C6510 -765277BA -A6081E2D -E616DE16 -EDFB0495 -12EDC382 -DA64FCA3 -E258DCC3 -92E0B54B -B41B389A -D818F160 -F8F1A55D -17916C31 -DBC21683 -3272DA3 -931C08B3 -9F8EA606 -232CB0D7 -EC870992 -B5F586AB -3ECEF68A -BF7BE567 -2C009224 -C2BE6397 -90EE0A64 -FC3E6BC3 -F1190F98 -1D05D7F8 -52AA90F8 -FF7C45B0 -7F5579FE -6609C7B -9B56CD69 -4A6830B1 -ECF9E86F -62331FA4 -294B7FAB -DC7DFBA7 -4DFA98F8 -CA6447C5 -B0416FDF -5FAD4523 -BBBEA8BD -47DA6D1D -FB598321 -E4A1EBBB -DD0CD41D -77FC8F60 -E4D74C7F -E4B2B064 -52EF568C -91E87E37 -FAF6069 -6E28131E -4D39B103 -59A3C4EC -3AA49C6E -D90E743 -44FC3B9A -7D181041 -AD89A0E7 -616A565F -129B06C1 -907298A -5E98085E -9648A06 -4FE2BFCA -F73FCCCC -62DC849B -BB543EC0 -EF301310 -9801EC66 -43557EE0 -2C382E49 -5151FB5C -3C1DCC5B -DD1C153B -77B3F30 -FDE0F3E1 -C967E75E -D5C68278 -6CC1FA37 -A3FED046 -5DE77F4E -FB7F40F6 -2C9191BB -D089B672 -1E9C6BAC -756468C2 -13352B81 -D2CC73C6 -55B4D4BD -8D6BD8F4 -65F7C5C0 -34A629D9 -79424449 -1CE03FD7 -451FC3D3 -255B39FA -F5F01286 -D1623E81 -4B33EB3D -CB2326EC -9C1189DE -1ED995BA -1298FE00 -A5FDB07F -D80D48D -575374E6 -3664F373 -5ED3FE -2171B235 -413BEA38 -FD67D4A -34F10135 -F4544A59 -16BA37D6 -649879DE -EE8D839B -A545FEF1 -4573F79 -D53FE034 -F4418DBF -92181012 -FB81741F -376DF3DE -19763A21 -47FB6EB7 -7F997F6A -CB94D301 -36461AC2 -A3C2378C -2541AE5 -67D92471 -EC619D04 -3BE21ECC -A441FB3D -A19F0955 -39492084 -6C680626 -C8D37B17 -68B215A0 -8B3846B1 -9B21F1DE -8021097 -EBCC81B2 -E9310566 -AD50FB31 -AF65F01B -739CBC38 -35573201 -F7F58733 -4015ACA -6AA65104 -33202FD0 -B5B1AE8B -C1C66F1C -8BA3BEC9 -E55A2ED0 -49ABBD4B -42DD0652 -A936340A -8EE63409 -5C64BE2D -4D47E9F -745994DC -7CCF78A6 -516C7BF5 -395F9C6 -58E11E54 -73EAA341 -E2D4631A -C3552D0F -4CF36F47 -3FE7034B -EEFCB8C6 -8219943B -E800BB09 -55544B91 -A3292FE8 -89BC5746 -F63B4EE1 -E866DAF9 -E99B2D4B -BB57E938 -34FB7E1A -EBB559C1 -24838BA -48075561 -9E621607 -998E5D98 -DFCF97D6 -2ECF6FC5 -15EE774F -C3E53B77 -8EF5F879 -763B1F55 -5C90BD9 -267E7FCE -625E8032 -F12724C8 -635FC29F -36AF3D44 -B7D2299C -6E8F0DBE -A76006D5 -723C72E0 -ECA467C2 -5C7DFAD4 -23AC163E -F306D785 -67972062 -57D31D2C -4038D82E -D21756BD -257A9123 -BE96CEDC -917019D1 -362C4F33 -2A305FAF -D4389CC3 -4C435238 -D68F1F0C -372B2979 -A7D6B646 -53A2E4C2 -19E556E -62D716A7 -64918481 -4D3AA8F0 -BA8C6B54 -2468C102 -499AD5B3 -81AE28CD -42E94077 -C969675A -341B58FE -41159415 -ADE3FA94 -FF5F42BA -379C83ED -A7E678F -C2D60CBB -CC75230C -A12B9169 -9CF6EE67 -2DD905D3 -EACCF580 -367F9A41 -477BB16D -8438B576 -756D14EF -980599BD -C181C6AD -99A3EF95 -151D4F12 -CD85DFB7 -695F12C9 -4CF48772 -CB00E50D -B9E2AF4C -97EC19E3 -54810B59 -EC4F2D89 -ED77DA60 -19451088 -D5A52E95 -F6FAA3D3 -F2458DDF -D5AB6D8 -D4042924 -AEBEC90 -505DB6D0 -52505B2A -ED9CB8B3 -DB06312E -C508C5AF -4279ED2F -5C72A874 -15E22E84 -54E967EE -80A13FE3 -EE346264 -3569BCA7 -9AA9263B -2BEC95EA -966F3368 -B74F6A2B -25ADEA56 -30A1BCE9 -71EE7AB3 -74807D9C -E4C0D662 -A62305A1 -6B9FB6F0 -C2CAB758 -E3FA413E -5266648 -754C0A13 -C4FD0D47 -BEFA676C -786AFDA7 -297AA674 -F2895DA0 -72A98C20 -A662B307 -54DFB586 -8147050E -CF7C5819 -760EC4AA -F011339D -2D496BE5 -6FD43E03 -1DFD893E -814ADCDF -B7C38DCA -2149763D -EB58B9BA -9F1B81B2 -94C15E0C 
-5A9923B7 -6C4E0E11 -C63C3D44 -BF9AA840 -1A3E83C5 -B81CEED7 -7E9FD999 -C1A15CFF -B28F657F -287D5990 -8DB5B01E -E241144B -EB0EA64E -884A8775 -99F5DBEA -3DBB21D6 -CC9472CE -B932014E -22A35325 -7B22DCF6 -882BB2C3 -B47CDAE -28767633 -ED17CB12 -6302A17F -25D91C08 -4D61BFB6 -FA240AD0 -E9DBF560 -F0E9AD0E -835C152D -61E5F126 -C176F8FB -B793DC1C -622E04B -D9FB6072 -60124DA7 -8BEA323D -6C496459 -FBE1E578 -F1C73C9E -6A7C4C58 -43F1DB50 -E9BF93AC -B7DC5C72 -2E68083B -F3DE081F -AAA39D71 -73406424 -B99D0139 -E4FB0C67 -142AB82D -3312CC57 -7A3BEDB7 -6B6E42D2 -F8330EA0 -2FE05DA6 -3E6BB118 -3C73E09 -5FDB1471 -6A226A31 -88792727 -78708ED3 -7A095177 -9CCAD23E -C3B75180 -226F8D4C -46DD1DBE -D799BE11 -1F852432 -7361585D -97380EF8 -4F1A8127 -2EB7A73C -35B892A7 -933075A1 -2B6D3BEB -BCDCA6F1 -E9409A22 -3A8E5575 -E37AE0CA -97C2866C -BA575BC0 -C16049A3 -79FED5B1 -6356E153 -98789BE6 -47B95292 -FBDEC30C -2275A4D -632C436D -FDCBB3FE -4E0ACB8D -36A77186 -593FDA25 -D9B74A5D -18021557 -3919EF9B -DDD00927 -B0C6DFEE -F761C0C7 -886DBB5 -807A21DF -778F06D1 -27A67D08 -2CBBD43E -2696EC44 -1F916066 -DE884377 -1472CADD -F30A91AE -89C35DEC -84E5487E -792613D4 -1E59B1A9 -B18BF896 -8D7034AC -A144CE10 -F2FFC2AD -2F5FBA7D -FFEDDB97 -7C506BFD -85B811DE -CC3AD4C0 -B6CC2F1 -BFD63C90 -281E81D7 -89E82B39 -E5371DE9 -5BB68ED3 -3DA62382 -3C8CBB1D -4BE92297 -878783A4 -F925E76B -77DE554E -7EB5914E -9B3F869E -F47FA82D -23E861F2 -19E38BDE -C26E5CA7 -317C9C64 -B96B12FC -F6EB43AE -F979DCAE -DD5BE081 -5B11401 -3C4A8866 -38C6F309 -2FE6DD71 -84E2BDC8 -2FA36F63 -F0D171C -8AAD8CA5 -92D5E506 -D4CF4E62 -82DFFC21 -2C686264 -CDDA9A2B -98CF101 -847DC151 -C0FEC6AC -A1638360 -DD36C966 -A6A8635A -F700C63D -48377DC5 -138CB9D1 -857331B5 -4844609F -E29224CA -A5079F42 -3B39EA92 -F020BFFE -4859CF8E -7C1B1E1E -DD95482D -24C31760 -3555FB83 -B1D20BED -403E6587 -D04E4309 -74F63A1 -EAFDC6CD -781795C6 -BA9A1FD1 -60F61FF3 -B93EE92A -7BCCFCDF -477FB17A -B508142D -D2BC8CD8 -F11D8200 -24A8149A -8F00F213 -3822F374 -E37B6219 -4727F504 -12CD7551 -5FD2779 -E8EC01F6 -29CE5CE4 -1EDDBCF9 -69AFBC0F -11B3CB87 -E39AE82B -E66CDCBF -6824DB75 -7183BE54 -12A11956 -ADA59196 -437E5E61 -F1A7F4A1 -671FDE0A -9202817E -33ABACB2 -B0705AB1 -39952407 -D3672EB1 -A03BD94B -B46D2252 -1DC47573 -EE4C78D4 -B6E4D8E0 -12C2206A -5656E1EE -4D9D4988 -35E36416 -3AC9C8F2 -2161B02C -1B5A8615 -62587331 -CC4036C -EACDCEC6 -F40C98DC -9C8FFDE9 -D87FB3C0 -C55AABE7 -1BE31E0B -C0796911 -C08C311 -E41B196D -E4FFB7A3 -2483C766 -FD348C63 -F294631A -7B74B50A -D6416CD9 -66559F6C -A7CE68E0 -ACD88C63 -BB49939B -7987A018 -E1797428 -CE39ECE8 -D7B3DA7 -8F2A3F0C -37E3C72E -21F1A24E -57AFCEF2 -AB8CF2 -15B5A4E9 -94094315 -29C3AEB6 -A56B4233 -6D57E64E -3A7399D2 -103AE960 -8B93E67E -D5193079 -767DA47D -88AEDE6F -ABCFBF34 -2650782C -7A716475 -C86C9BBA -4423420D -3AF8FD02 -72E202EE -5A264F7B -4E103072 -4DA5A0E0 -59319F97 -B54F9AC -556DF0B3 -ABAD7DC0 -2A715C13 -9D443D0F -54BDC92C -1EC2B967 -80BE3AC2 -FA646E8A -2EE396F1 -8B0315E8 -9F52B6E -DAD30422 -2E9B6CDB -8686D47A -5D9DB3C7 -717E799B -20A4D4E5 -C2DC8AE4 -F630FADD -8C7DF047 -65F4928C -BE66D11E -6004484D -C1B509AB -FAA4C75F -B3D272A0 -7FE6F083 -A54B6584 -FC3292F -4D27DDFC -A1ABC224 -872FED55 -D235AEC -27ED8546 -1B170B2A -CE9E5C0 -2267B02 -285992BD -F855CC8 -8FFB1F6F -C7BDDF81 -349B4F5F -B9B28843 -D5D532A0 -8FD7BE3C -2DB04DE8 -C7D0C2FD -B6822987 -1FE0710D -8EADA490 -A03F99CF -F3E7F902 -F56CCCA3 -CED5B6BF -D6B3DC0D -92AA9FE8 -351208D -A1C9623B -5802547D -3480D77C -404D4E65 -679025BA -905FF962 -B7130CA8 -5AFA9CFE -2A654EFC -26218A8 -473A88A -5E3534CC -771FF1E1 -EADD6296 -DF7157B3 -D48E42E8 
-3D6E848B -29CD6C -68732656 -A6C6D52A -B50279FF -705B645 -6DF7F119 -34152606 -72948D92 -18BEE72 -36BE21E3 -C34FD53A -9765DFF -E5C9B4AF -4604B155 -DEAC2388 -7841FE0C -2E275885 -3EE65330 -EB66439B -FF4AB5DE -67EDA5EA -BB722F57 -6A645B7 -DE9DD302 -5AC7601D -371B5D5B -42BAC84D -21C7AA9E -F4ECBE94 -554C8B8A -B7C8BB88 -4C77DB1D -D4D8F3AC -DAB292E5 -85D906E8 -47785703 -9CEE88D4 -7DB86DB7 -694B5A34 -DE77B361 -E8DE3CB9 -315EC35A -A71943BC -C297B8CA -55EA528C -A11AF15D -1490835E -19DA117B -403B0CC3 -FF7DE389 -ED6C22E8 -6F8A8782 -7BF2BA9B -6C95F5DF -F8270769 -AB421268 -F06B05EB -8FF7DE5F -F2AB2FCD -A5EDD602 -31F05712 -3C269177 -67D92F11 -38D8D3C5 -2047013B -8E8BA724 -EB6A773 -5AF14AD1 -49910D46 -C9D6F784 -B44B09CF -1AEA48EF -2F12BD47 -10E3F7C9 -39EA8108 -B88ADC9 -19DAC1B4 -554908DC -587A0A7E -109D1E5B -1920E3CF -BC49C914 -C1EB74A7 -A5E9A494 -5FA5B8C9 -320673C2 -CE643004 -720E4075 -FDFED2FE -89C22F8E -40887408 -3235FF6B -A906F59D -F6F98F12 -7122ECA4 -4CDFCB42 -391F2365 -53AE3667 -6CCCE2E2 -44877A8A -92561CAB -DA5DE0E7 -73B898D6 -2E37229E -ABAAED3C -21087331 -58C85412 -8BB37690 -1256467F -6EE9FAF7 -DB0895D6 -954EF968 -1C7693BC -5786650F -7D441E12 -10AA9174 -492C6A3B -34374CC9 -98E59E7C -5B7BD4E0 -D1124C9F -B5B3362F -8ECC58C7 -8EB0E23E -72991400 -13DF853B -789E8DFE -D85E60DC -A168D4D -C3B6FA3A -11443EE2 -F63F9FDD -1A14A7A5 -5EEBFD5 -B24D582D -AEA8F125 -4AA038EE -5F6A1A16 -CBADD812 -340605AA -8BD8F6E9 -B85F3A6A -A585AE8C -6D12D2B3 -17C97329 -DBB835B9 -789C3DF4 -E048D462 -BECE080A -506DE5CA -63C4FA5C -7C2D8103 -689A3516 -B218BADF -8B7F0BDE -85B17891 -8888A9C6 -3DFC9FA8 -5F2859CD -FF72AE34 -9EA3FFCA -CF2194D2 -53B56E7F -C7009619 -B127FD51 -3A513DF0 -E9147D4B -2FDF3C37 -22FA1629 -61480015 -57EE267A -EE04DA43 -EB2D289C -2C102144 -B012EED -B1B339C8 -AC1EA89 -3A4420D0 -5623907B -B0613D35 -A70F1B2C -589E3EA7 -F998AB7D -9566E921 -B133DB2D -A3106F6A -EFB4518 -6AA3FB8F -C505C8DF -65032E33 -6D3942DF -333553CC -BF392E2 -6C77F980 -39211AFC -9E0B71C9 -A3BB7123 -7CE16B9A -F15BB634 -BD68DE3E -77BB27AB -BB72659C -BFA916CA -7022CF20 -EA64C93D -B61C32CC -20201879 -148DDADC -58977 -8D5CC2E6 -76E678BD -5655B362 -587EAB4A -599E3DCF -7B470038 -E87E82DB -9088EC5E -ED9F9E4C -3DD98E27 -5AFA5052 -3DF313C4 -BB22A60D -44D97BDA -601409F3 -CD1D3CFE -7EAE52D0 -41ABBAA0 -A1D7C883 -FFE2B4C9 -13717374 -9DD27EC8 -29301EF0 -87953D6C -9309161C -C91DFE7C -DD5EC452 -F6C27DF2 -43B433FD -6D16B93F -92F09DBA -ABB598EF -B49A721A -3A03EE56 -3177D3AF -5D24FD94 -FEF88FB2 -52B3170F -64264DCC -18B683B7 -6B21935F -901A396C -4601FB55 -51F2547E -DD37C23B -35E6B3DF -31ABC979 -C7223449 -ABCA9CFB -A8F57AFA -A097240 -78704130 -7F1D7661 -456C2409 -63E31F62 -FD0D4BB1 -97FCC39 -951A7C93 -893165C9 -E86163CC -25F5694C -8890910A -43F3AE36 -55D414A1 -1ADDD3BA -C7EDFDDF -5A8607BA -219D3208 -27BD79E2 -2E9EA4B8 -5D8F951A -F9E880D5 -B2C7612A -862CCCF3 -7EDC71AC -1B6EA644 -EC3AA9A0 -970224FD -6C0DD16A -C589D1B6 -71AC91EE -C75B0206 -50232786 -316AAD4D -F4D5A31B -E30CCF43 -BD72BEAD -26DE4F8F -56E97741 -9243E978 -F7E2363D -BAE2CF31 -6367CFB1 -B72ED4E6 -75216393 -4626E74F -61194364 -8D6726A8 -458611B8 -1B536E4D -837AAD1F -F5A226D8 -8BB37701 -31F19003 -8E48DEEE -9DA11E9 -3BBB5BB4 -C6F15B5D -1A53A4EB -69AADAB -4FAE6295 -F0943601 -A449516E -BF7EE395 -176B1370 -F55873EE -553FEEF0 -9F3AB09 -2539B92E -F6803BC -BAA192FB -DBB0AD5A -B9C5415 -F92D0588 -88B9E738 -A033C767 -A1CA1EFF -5AC07200 -AC60C03D -17FE20F9 -B898B9AC -51AF425E -2706FC42 -F2A258E7 -353652D7 -CF3F89EE -63A13050 -5E6A7997 -153FD92F -1D0E8614 -6E504447 -5AAEC133 -9B6E5499 -64D5EAE6 -A29CFBAB -52B44B68 -8DC7C01A 
-704EB2F1 -395F1F7 -7D897418 -2FC66846 -ECCE81AE -21CD8E31 -B2EFA3D4 -16C4CD41 -D6A21ED0 -944897F9 -F495D730 -B4317C3C -8C074582 -22F6A9D9 -CE4425FB -FB08BCBA -DF07A006 -293AD5BA -BD224A44 -9DA6701B -DAB46DE4 -9F88773B -57CC02C7 -7A6B68E4 -55A54D48 -BCFC1C53 -DF64F920 -A9FE6014 -4C64DB55 -5FE9345F -412A1E48 -45D41945 -23B44D08 -8D5563A2 -26E5E437 -CECDF4D0 -1BE55025 -84329F92 -37C97F8F -C3CDE976 -580955A -C79E1131 -C5BC58E7 -7D14509B -3DE94089 -1B78FE71 -49A0ECD9 -501D09B1 -F30135CD -B0FA41B4 -33B11313 -32AB01B -635EBA76 -666D7FE5 -68CCC93 -59B0ADA5 -B305CBAA -1C553509 -5E564F7C -F057084C -52811FC8 -987465B2 -461DA750 -F0C471BB -3C9D3E64 -73C920AF -355A26B9 -3A1FDD13 -CEA3F7DD -66C0687 -1319291 -9045182D -174C724D -2A491012 -BA53519F -A62B41D8 -F6E1559E -25F93E6F -2A40C5F4 -C63D1AC2 -82598002 -2B81101A -63442848 -3788BB2D -74DDC016 -214CE0F4 -9CBAA8BD -9288E1AC -EF76E528 -719E7BAE -BD579EF6 -4E6B0C62 -6285F757 -9049BDA3 -80BFE3C1 -4344B7A7 -4552F1DD -DE2C0DAC -86346BE2 -A0A897E7 -1797D93 -6CF3C7F0 -7592D9E7 -CFB46F1E -17D6FF93 -87FF1727 -198FC755 -303540EF -78C07416 -46CB391E -8D441653 -3724DA3C -860D4DDF -A99F046E -4B167D86 -E2AFCBE9 -6608F2D2 -4E49A130 -3C64B760 -958BCEB3 -8C784B24 -5E07EF07 -7E6CAC6A -B69765D8 -65897B6D -60A8FB7D -6706E0E1 -142E4310 -15C4944C -F6A075AD -3CF66DF8 -CE1EFE72 -D6495864 -2BDEFA6B -9E511045 -F2E2E9A7 -B71B03EB -15DD8D69 -65E5A555 -52C644AE -301A8F69 -35075232 -17ADE8C4 -A2C808CC -F1A4C57B -D6EE3EF3 -85942F72 -26011F23 -D4211E97 -595E1A12 -6886CE0 -FBD6F396 -D10BD980 -6615476D -4662EB8F -F80BE955 -93A6E68E -4C3D4CAA -5838D0CB -756FB6E4 -F0BC8312 -EB89BE83 -D34E119E -34F860EC -F371DC73 -BB166E0D -CE86AF89 -C177E633 -A19C1D9B -B1DCBF1B -D7310057 -2452939E -120A830 -F92A9928 -64877B92 -3D69A585 -178187B6 -146C0495 -9A3D8886 -C79478AD -9A429976 -29795A97 -32BD0034 -1EE08CD -8982284A -ED362AC4 -4A1AC734 -6FD164B3 -422ADEBA -9374B593 -BBFA8568 -1C0B26A5 -5DF68365 -CFA1D689 -1C9509C2 -1056EAC4 -D492D000 -64076487 -2C1FB65B -9E1DEBC7 -C5AECD05 -39652664 -57A1B9F9 -3652484 -E8CCF72B -CB7EC405 -7DA97E78 -7ACE1B2C -A5DC0B75 -40C14422 -777B17AF -5AA3FEDF -319C2B1C -AB8EEE5F -159D66E5 -3E479D0 -12AF93DE -55EA550A -38853E1F -FB943864 -781FA52E -4FB9C9FA -377D8866 -8411E296 -641D997F -1933684F -27A62DEF -50E15F68 -755BCD7C -5DF3466F -494A937C -8763C6BD -C04B98E0 -E9E067FF -444151AB -C5FC7398 -5EC7D30E -E0610B7E -76CEBB5 -B15D9821 -37B2D1E2 -CC1249BF -3E064388 -246B17B3 -4A342228 -529E849B -F25F250D -31F3E925 -D1112DCA -DA6A8BC9 -2A7789D8 -C0C2C72D -4BB23226 -68166638 -4EC7519F -D559B4B7 -8035E823 -DFB06DE0 -2B4B86 -83D6F12F -84AC7F7B -7139E98B -C42D8AE3 -2992AD9C -E1E24DA1 -838772BD -CA28D517 -3606947F -B9FDFA59 -6C4F8489 -76DBFFD4 -3F0BFDF6 -1B04AD1B -8BA40134 -842A54F6 -621A0DFE -1F3729FC -C53AFEFE -CD5F1E79 -D2C0C70 -30A4FF4F -D384C76 -D73B9B17 -C74DC3F9 -E5ACD113 -901E6D5D -D376A71F -57BA08F9 -17E25669 -F7485021 -BCD1B9C5 -90C1A916 -EEF9DE6E -6AD37907 -40B05A7B -4A56C1D -901093E1 -5424EEE9 -3336300D -8B1767F3 -707A4B23 -37290194 -13A5E016 -C25902C0 -5C04C3AE -B7D84F4D -D57A495F -EE168042 -1584DB78 -7DBFDBD3 -DBE2218D -9EED8CD4 -2A562C0F -C76F7E04 -8FCA82B8 -7211C54F -8E76E82C -9BAF59A6 -C1E7B9CE -28E9E29F -6746FB40 -7841DDA1 -37D07C7 -88A5CF5 -4B0B8A4E diff --git a/finn-rtllib/memstream/sim/memstream_tb.sv b/finn-rtllib/memstream/sim/memstream_tb.sv new file mode 100644 index 0000000000..4b2e850415 --- /dev/null +++ b/finn-rtllib/memstream/sim/memstream_tb.sv @@ -0,0 +1,212 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream_tb; + localparam int unsigned DEPTH = 256; + localparam int unsigned DATA_WIDTH = 32; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst; + + // Configuration Interface + logic [31:0] config_address; + logic config_ce; + logic config_we; + logic [DATA_WIDTH-1:0] config_d0; + uwire config_rack; + uwire [DATA_WIDTH-1:0] config_q0; + + // Streamed Output + logic ordy; + uwire ovld; + uwire [DATA_WIDTH-1:0] odat; + + initial begin + config_address = 'x; + config_ce = 0; + config_we = 0; + config_d0 = 'x; + + ordy = 0; + + rst = 1; + repeat(16) @(posedge clk); + rst <= 0; + + // Write Parameters + config_ce <= 1; + config_we <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + config_address <= i; + config_d0 <= i; + @(posedge clk); + end + config_address <= 'x; + config_ce <= 0; + config_we <= 0; + config_d0 <= 'x; + + rst <= 1; + @(posedge clk); + rst <= 0; + + // One Round of Stream Read + ordy <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + @(posedge clk iff ovld); + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + ordy <= 0; + + // Full Parameter Readback + if(1) begin + automatic logic [DATA_WIDTH-1:0] Q[$] = {}; + + config_ce <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + config_address <= i; + @(posedge clk); + Q.push_back(i); + + if(config_rack) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + config_address <= 'x; + config_ce <= 0; + + while(Q.size) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + + @(posedge clk iff config_rack); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + + repeat(6) @(posedge clk); + + // Another Round of Stream Read + ordy <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin 
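+			// wait for each valid beat (ovld asserted while ordy is held high) and compare odat against the expected index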
+ @(posedge clk iff ovld); + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + ordy <= 0; + + // A Round of Stream Read with intermittent Read Backs + if(1) begin + automatic logic [DATA_WIDTH-1:0] Q[$] = {}; + + for(int unsigned i = 0; i < DEPTH; i++) begin + do begin + // Randomly delayed Readiness + if($urandom()%5 != 0) ordy <= 1; + + // Issue and Check Random Read Backs + if($urandom()%9 == 0) begin + automatic int unsigned addr = $urandom() % DEPTH; + config_ce <= 1; + config_address <= addr; + Q.push_back(addr); + end + @(posedge clk); + config_ce <= 0; + config_address <= 'x; + + if(config_rack) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + + end while(!ovld || !ordy); + ordy <= 0; + + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + + while(Q.size) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + + @(posedge clk iff config_rack); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + ordy <= 0; + + repeat(2) @(posedge clk); + $display("Test completed."); + $finish; + end + + memstream #( + .DEPTH(DEPTH), + .WIDTH(DATA_WIDTH) + ) dut ( + .clk, .rst, + + .config_address, + .config_ce, + .config_we, + .config_d0, + .config_q0, + .config_rack, + + .ordy, + .ovld, + .odat + ); + +endmodule : memstream_tb diff --git a/finn-rtllib/memstream/sim/tb_memstream.v b/finn-rtllib/memstream/sim/tb_memstream.v deleted file mode 100644 index ad3efad5bd..0000000000 --- a/finn-rtllib/memstream/sim/tb_memstream.v +++ /dev/null @@ -1,369 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -`timescale 1ns/10ps - -module tb_memstream; - -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths -parameter CONFIG_EN = 1; -parameter NSTREAMS = 4;//1 up to 6 - -parameter MEM_DEPTH = 9216; -parameter MEM_WIDTH = 32; -parameter MEM_INIT = "./"; -parameter MEM_CHECK = "golden.dat"; - -//widths per stream -parameter STRM0_WIDTH = 32; -parameter STRM1_WIDTH = 32; -parameter STRM2_WIDTH = 32; -parameter STRM3_WIDTH = 32; -parameter STRM4_WIDTH = 1; -parameter STRM5_WIDTH = 1; - -//depths per stream -parameter STRM0_DEPTH = 2304; -parameter STRM1_DEPTH = 2304; -parameter STRM2_DEPTH = 2304; -parameter STRM3_DEPTH = 2304; -parameter STRM4_DEPTH = 1; -parameter STRM5_DEPTH = 1; - -//offsets for each stream -parameter STRM0_OFFSET = 0; -parameter STRM1_OFFSET = 2304; -parameter STRM2_OFFSET = 4608; -parameter STRM3_OFFSET = 6912; -parameter STRM4_OFFSET = 0; -parameter STRM5_OFFSET = 0; - - -reg clk; -reg rst; - -reg [31:0] config_address = 0; -reg config_ce = 0; -reg config_we = 0; -reg [31:0] config_d0 = 0; -wire [31:0] config_q0; - -//multiple wire AXI Streams -reg m_axis_0_afull; -reg m_axis_0_tready; -wire m_axis_0_tvalid; -wire [STRM0_WIDTH-1:0] m_axis_0_tdata; - -reg m_axis_1_afull; -reg m_axis_1_tready; -wire m_axis_1_tvalid; -wire [STRM1_WIDTH-1:0] m_axis_1_tdata; - -reg m_axis_2_afull; -reg m_axis_2_tready; -wire m_axis_2_tvalid; -wire [STRM2_WIDTH-1:0] m_axis_2_tdata; - -reg m_axis_3_afull; -reg m_axis_3_tready; -wire m_axis_3_tvalid; -wire [STRM3_WIDTH-1:0] m_axis_3_tdata; - -reg m_axis_4_afull; -reg m_axis_4_tready; -wire m_axis_4_tvalid; -wire [STRM4_WIDTH-1:0] m_axis_4_tdata; - -reg m_axis_5_afull; -reg m_axis_5_tready; -wire m_axis_5_tvalid; -wire [STRM5_WIDTH-1:0] m_axis_5_tdata; - -reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0]; -integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5; -integer done = 0; -reg [5:0] rng; - -//clock -initial begin - clk = 0; - forever #5 clk = ~clk; -end - -initial begin - rst = 1; - config_ce = 0; - m_axis_0_afull = 0; - m_axis_1_afull = 0; - m_axis_2_afull = 0; - m_axis_3_afull = 0; - m_axis_4_afull = 0; - m_axis_5_afull = 0; - m_axis_0_tready = 1; - m_axis_1_tready = 1; - m_axis_2_tready = 1; - m_axis_3_tready = 1; - m_axis_4_tready = 1; - m_axis_5_tready = 1; - repeat(100) @(negedge clk); - rst = 0; - #100 - fork - begin - $display("Starting to generate random AFULL"); - while(~done) begin - rng = $random; - m_axis_0_afull = rng[0]; - m_axis_1_afull = rng[1]; - m_axis_2_afull = rng[2]; - m_axis_3_afull = rng[3]; - m_axis_4_afull = rng[4]; - m_axis_5_afull = rng[5]; - @(negedge clk); - end - end - join -end - - -//DUT -memstream -#( - CONFIG_EN, - NSTREAMS, - MEM_DEPTH, - MEM_WIDTH, - MEM_INIT, - - //widths per stream - STRM0_WIDTH, - STRM1_WIDTH, - STRM2_WIDTH, - STRM3_WIDTH, - STRM4_WIDTH, - STRM5_WIDTH, - - //depths per stream - STRM0_DEPTH, - STRM1_DEPTH, - STRM2_DEPTH, - STRM3_DEPTH, - STRM4_DEPTH, - STRM5_DEPTH, - - //offsets for each stream - STRM0_OFFSET, - STRM1_OFFSET, - STRM2_OFFSET, - STRM3_OFFSET, - STRM4_OFFSET, - STRM5_OFFSET -) -dut -( - clk, - ~rst, - - //optional AXI-Lite interface - config_address, - config_ce, - config_we, - config_d0, - config_q0, - - //multiple output AXI Streams - m_axis_0_afull, - m_axis_0_tready, - m_axis_0_tvalid, - m_axis_0_tdata, - - m_axis_1_afull, - m_axis_1_tready, - m_axis_1_tvalid, - m_axis_1_tdata, - - m_axis_2_afull, - m_axis_2_tready, - m_axis_2_tvalid, - m_axis_2_tdata, - - m_axis_3_afull, - m_axis_3_tready, - 
m_axis_3_tvalid, - m_axis_3_tdata, - - m_axis_4_afull, - m_axis_4_tready, - m_axis_4_tvalid, - m_axis_4_tdata, - - m_axis_5_afull, - m_axis_5_tready, - m_axis_5_tvalid, - m_axis_5_tdata - - -); - -//stream checkers -initial begin - ptr0 = STRM0_OFFSET; - ptr1 = STRM1_OFFSET; - ptr2 = STRM2_OFFSET; - ptr3 = STRM3_OFFSET; - ptr4 = STRM4_OFFSET; - ptr5 = STRM5_OFFSET; - fork - //check stream 0 - begin - $display("Starting stream 0 checker"); - while(~done & (NSTREAMS > 0)) begin - @(negedge clk); - if(m_axis_0_tvalid) begin - if(m_axis_0_tdata != golden[ptr0]) begin - $display("Mismatch on stream 0"); - $stop(); - end - //increment pointer - ptr0 = ptr0 + 1; - //rewind pointer if it's reached end - if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH)) - ptr0 = STRM0_OFFSET; - end - end - end - //check stream 1 - begin - $display("Starting stream 1 checker"); - while(~done & (NSTREAMS > 1)) begin - @(negedge clk); - if(m_axis_1_tvalid) begin - if(m_axis_1_tdata != golden[ptr1]) begin - $display("Mismatch on stream 1"); - $stop(); - end - //increment pointer - ptr1 = ptr1 + 1; - //rewind pointer if it's reached end - if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH)) - ptr1 = STRM1_OFFSET; - end - end - end - - //check stream 2 - begin - $display("Starting stream 2 checker"); - while(~done & (NSTREAMS > 2)) begin - @(negedge clk); - if(m_axis_2_tvalid) begin - if(m_axis_2_tdata != golden[ptr2]) begin - $display("Mismatch on stream 2"); - $stop(); - end - //increment pointer - ptr2 = ptr2 + 1; - //rewind pointer if it's reached end - if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH)) - ptr2 = STRM2_OFFSET; - end - end - end - //check stream 3 - begin - $display("Starting stream 3 checker"); - while(~done & (NSTREAMS > 3)) begin - @(negedge clk); - if(m_axis_3_tvalid) begin - if(m_axis_3_tdata != golden[ptr3]) begin - $display("Mismatch on stream 3"); - $stop(); - end - //increment pointer - ptr3 = ptr3 + 1; - //rewind pointer if it's reached end - if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH)) - ptr3 = STRM3_OFFSET; - end - end - end - //check stream 4 - begin - $display("Starting stream 4 checker"); - while(~done & (NSTREAMS > 4)) begin - @(negedge clk); - if(m_axis_4_tvalid) begin - if(m_axis_4_tdata != golden[ptr4]) begin - $display("Mismatch on stream 4"); - $stop(); - end - //increment pointer - ptr4 = ptr4 + 1; - //rewind pointer if it's reached end - if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH)) - ptr4 = STRM4_OFFSET; - end - end - end - //check stream 5 - begin - $display("Starting stream 5 checker"); - while(~done & (NSTREAMS > 5)) begin - @(negedge clk); - if(m_axis_5_tvalid) begin - if(m_axis_5_tdata != golden[ptr5]) begin - $display("Mismatch on stream 5"); - $stop(); - end - //increment pointer - ptr5 = ptr5 + 1; - //rewind pointer if it's reached end - if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH)) - ptr5 = STRM5_OFFSET; - end - end - end - join -end - -initial begin - done = 0; - $readmemh(MEM_CHECK,golden); -// $dumpfile("wave.vcd"); -// $dumpvars(0,tb_memstream); - @(negedge rst); - #10000000 - $display("Test done!"); - done = 1; - #1000 - $finish(); -end - -endmodule diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v deleted file mode 100644 index c66807454b..0000000000 --- a/finn-rtllib/memstream/sim/tb_memstream_writes.v +++ /dev/null @@ -1,486 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -`timescale 1ns/10ps - -module tb_memstream_writes; - -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths -parameter CONFIG_EN = 1; -parameter NSTREAMS = 2;//1 up to 6 - -parameter MEM_DEPTH = 40; -parameter MEM_WIDTH = 70; - -//widths per stream -parameter STRM0_WIDTH = 70; -parameter STRM1_WIDTH = 32; -parameter STRM2_WIDTH = 32; -parameter STRM3_WIDTH = 32; -parameter STRM4_WIDTH = 1; -parameter STRM5_WIDTH = 1; - -//depths per stream -parameter STRM0_DEPTH = 20; -parameter STRM1_DEPTH = 20; -parameter STRM2_DEPTH = 2304; -parameter STRM3_DEPTH = 2304; -parameter STRM4_DEPTH = 1; -parameter STRM5_DEPTH = 1; - -//offsets for each stream -parameter STRM0_OFFSET = 0; -parameter STRM1_OFFSET = 20; -parameter STRM2_OFFSET = 4608; -parameter STRM3_OFFSET = 6912; -parameter STRM4_OFFSET = 0; -parameter STRM5_OFFSET = 0; - - -reg clk; -reg rst; - -wire awready; -reg awvalid; -reg [31:0] awaddr; -reg [2:0] awprot; -//write data -wire wready; -reg wvalid; -reg [31:0] wdata; -reg [3:0] wstrb; -//burst response -reg bready; -wire bvalid; -wire [1:0] bresp; - -//Read channels -//read address -wire arready; -reg arvalid; -reg [31:0] araddr; -reg [2:0] arprot; -//read data -reg rready; -wire rvalid; -wire [1:0] rresp; -wire [31:0] rdata; - -//multiple wire AXI Streams -reg m_axis_0_afull; -reg m_axis_0_tready; -wire m_axis_0_tvalid; -wire [STRM0_WIDTH-1:0] m_axis_0_tdata; - -reg m_axis_1_afull; -reg m_axis_1_tready; -wire m_axis_1_tvalid; -wire [STRM1_WIDTH-1:0] m_axis_1_tdata; - -reg m_axis_2_afull; -reg m_axis_2_tready; -wire m_axis_2_tvalid; -wire [STRM2_WIDTH-1:0] m_axis_2_tdata; - -reg m_axis_3_afull; -reg m_axis_3_tready; -wire m_axis_3_tvalid; -wire [STRM3_WIDTH-1:0] m_axis_3_tdata; - -reg m_axis_4_afull; -reg m_axis_4_tready; -wire m_axis_4_tvalid; -wire [STRM4_WIDTH-1:0] m_axis_4_tdata; - -reg m_axis_5_afull; -reg m_axis_5_tready; -wire m_axis_5_tvalid; -wire [STRM5_WIDTH-1:0] m_axis_5_tdata; - -reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0]; -reg [MEM_WIDTH-1:0] gword; 
-integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5; -integer done = 0; -integer i, j; -reg [5:0] rng; - -parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32; - -task axi_write; - input [MEM_WIDTH-1:0] data; - input [31:0] adr; - begin - for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin - @(negedge clk); - awvalid = 1; - wvalid = 1; - wdata = data>>(j*32); - awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4; - fork - begin - @(posedge awready); - @(posedge clk) awvalid = 0; - end - begin - @(posedge wready); - @(posedge clk) wvalid = 0; - end - join - @(posedge clk); - end - end -endtask - -task axi_read; - input [31:0] adr; - output [MEM_WIDTH-1:0] data; - begin - data = 0; - for(j=0; j 0)) begin - @(negedge clk); - if(m_axis_0_tvalid & m_axis_0_tready) begin - if(m_axis_0_tdata != golden[ptr0]) begin - $display("Mismatch on stream 0"); - $stop(); - end - //increment pointer - ptr0 = ptr0 + 1; - //rewind pointer if it's reached end - if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH)) - ptr0 = STRM0_OFFSET; - end - end - end - //check stream 1 - begin - $display("Starting stream 1 checker"); - while(~done & (NSTREAMS > 1)) begin - @(negedge clk); - if(m_axis_1_tvalid & m_axis_1_tready) begin - if(m_axis_1_tdata != golden[ptr1]) begin - $display("Mismatch on stream 1"); - $stop(); - end - //increment pointer - ptr1 = ptr1 + 1; - //rewind pointer if it's reached end - if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH)) - ptr1 = STRM1_OFFSET; - end - end - end - //check stream 2 - begin - $display("Starting stream 2 checker"); - while(~done & (NSTREAMS > 2)) begin - @(negedge clk); - if(m_axis_2_tvalid & m_axis_2_tready) begin - if(m_axis_2_tdata != golden[ptr2]) begin - $display("Mismatch on stream 2"); - $stop(); - end - //increment pointer - ptr2 = ptr2 + 1; - //rewind pointer if it's reached end - if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH)) - ptr2 = STRM2_OFFSET; - end - end - end - //check stream 3 - begin - $display("Starting stream 3 checker"); - while(~done & (NSTREAMS > 3)) begin - @(negedge clk); - if(m_axis_3_tvalid & m_axis_3_tready) begin - if(m_axis_3_tdata != golden[ptr3]) begin - $display("Mismatch on stream 3"); - $stop(); - end - //increment pointer - ptr3 = ptr3 + 1; - //rewind pointer if it's reached end - if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH)) - ptr3 = STRM3_OFFSET; - end - end - end - //check stream 4 - begin - $display("Starting stream 4 checker"); - while(~done & (NSTREAMS > 4)) begin - @(negedge clk); - if(m_axis_4_tvalid & m_axis_4_tready) begin - if(m_axis_4_tdata != golden[ptr4]) begin - $display("Mismatch on stream 4"); - $stop(); - end - //increment pointer - ptr4 = ptr4 + 1; - //rewind pointer if it's reached end - if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH)) - ptr4 = STRM4_OFFSET; - end - end - end - //check stream 5 - begin - $display("Starting stream 5 checker"); - while(~done & (NSTREAMS > 5)) begin - @(negedge clk); - if(m_axis_5_tvalid & m_axis_5_tready) begin - if(m_axis_5_tdata != golden[ptr5]) begin - $display("Mismatch on stream 5"); - $stop(); - end - //increment pointer - ptr5 = ptr5 + 1; - //rewind pointer if it's reached end - if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH)) - ptr5 = STRM5_OFFSET; - end - end - end - join -end - -initial begin - done = 0; - @(negedge rst); - $dumpfile("wave.vcd"); - $dumpvars(0,tb_memstream_writes); - #50000 - $display("Test done!"); - done = 1; - #1000 - $finish(); -end - -endmodule diff --git a/finn-rtllib/memstream/sim/test.sh b/finn-rtllib/memstream/sim/test.sh deleted file mode 100755 index 7cb0497d26..0000000000 --- a/finn-rtllib/memstream/sim/test.sh 
+++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -iverilog ../hdl/*.v tb_memstream_writes.v -o sim -./sim diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl index 87565bc561..e802d81c79 100644 --- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl +++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl @@ -8,42 +8,21 @@ proc init_gui { IPINST } { #Adding Page set Page_0 [ipgui::add_page $IPINST -name "Page 0"] ipgui::add_param $IPINST -name "AXILITE_ADDR_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0} - ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox - ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_OFFSET" 
-parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_WIDTH" -parent ${Page_0} - - + ipgui::add_param $IPINST -name "DEPTH" -parent ${Page_0} + ipgui::add_param $IPINST -name "INIT_FILE" -parent ${Page_0} + ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} + ipgui::add_param $IPINST -name "WIDTH" -parent ${Page_0} } -proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_WIDTH } { +proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.DEPTH PARAM_VALUE.WIDTH } { # Procedure called to update AXILITE_ADDR_WIDTH when any of the dependent parameters in the arguments change + set AXILITE_ADDR_WIDTH ${PARAM_VALUE.AXILITE_ADDR_WIDTH} - set MEM_DEPTH ${PARAM_VALUE.MEM_DEPTH} - set MEM_WIDTH ${PARAM_VALUE.MEM_WIDTH} - set values(MEM_DEPTH) [get_property value $MEM_DEPTH] - set values(MEM_WIDTH) [get_property value $MEM_WIDTH] - set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(MEM_DEPTH) $values(MEM_WIDTH)] $AXILITE_ADDR_WIDTH + set DEPTH ${PARAM_VALUE.DEPTH} + set WIDTH ${PARAM_VALUE.WIDTH} + set values(DEPTH) [get_property value $DEPTH] + set values(WIDTH) [get_property value $WIDTH] + set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(DEPTH) $values(WIDTH)] $AXILITE_ADDR_WIDTH } proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } { @@ -51,48 +30,21 @@ proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } return true } -proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } { - # Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } { + # Procedure called to update DEPTH when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } { - # Procedure called to validate CONFIG_EN +proc validate_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } { + # Procedure called to validate DEPTH return true } -proc update_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } { - # Procedure called to update MEM_DEPTH when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } { + # Procedure called to update INIT_FILE when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } { - # Procedure called to validate MEM_DEPTH - return true -} - -proc update_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } { - # Procedure called to update MEM_INIT when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } { - # Procedure called to validate MEM_INIT - return true -} - -proc update_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } { - # Procedure called to update MEM_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } { - # Procedure called to validate MEM_WIDTH - return true -} - -proc update_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } { - # Procedure called to update NSTREAMS when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } { - # Procedure called to validate NSTREAMS +proc validate_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } { + # Procedure called to validate INIT_FILE return true } @@ -105,192 +57,29 @@ proc 
validate_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } { return true } -proc update_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to update STRM0_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to validate STRM0_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to update STRM0_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to validate STRM0_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to update STRM0_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to validate STRM0_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to update STRM1_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to validate STRM1_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to update STRM1_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to validate STRM1_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to update STRM1_WIDTH when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } { + # Procedure called to update WIDTH when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to validate STRM1_WIDTH +proc validate_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } { + # Procedure called to validate WIDTH return true } -proc update_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to update STRM2_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to validate STRM2_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to update STRM2_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to validate STRM2_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to update STRM2_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to validate STRM2_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to update STRM3_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to validate STRM3_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to update STRM3_OFFSET when any of the 
dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to validate STRM3_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to update STRM3_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to validate STRM3_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to update STRM4_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to validate STRM4_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to update STRM4_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to validate STRM4_OFFSET - return true -} -proc update_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to update STRM4_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to validate STRM4_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to update STRM5_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to validate STRM5_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to update STRM5_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to validate STRM5_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to update STRM5_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to validate STRM5_WIDTH - return true -} - - -proc update_MODELPARAM_VALUE.CONFIG_EN { MODELPARAM_VALUE.CONFIG_EN PARAM_VALUE.CONFIG_EN } { +proc update_MODELPARAM_VALUE.DEPTH { MODELPARAM_VALUE.DEPTH PARAM_VALUE.DEPTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.CONFIG_EN}] ${MODELPARAM_VALUE.CONFIG_EN} + set_property value [get_property value ${PARAM_VALUE.DEPTH}] ${MODELPARAM_VALUE.DEPTH} } -proc update_MODELPARAM_VALUE.NSTREAMS { MODELPARAM_VALUE.NSTREAMS PARAM_VALUE.NSTREAMS } { +proc update_MODELPARAM_VALUE.WIDTH { MODELPARAM_VALUE.WIDTH PARAM_VALUE.WIDTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.NSTREAMS}] ${MODELPARAM_VALUE.NSTREAMS} + set_property value [get_property value ${PARAM_VALUE.WIDTH}] ${MODELPARAM_VALUE.WIDTH} } -proc update_MODELPARAM_VALUE.MEM_DEPTH { MODELPARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_DEPTH } { +proc update_MODELPARAM_VALUE.INIT_FILE { MODELPARAM_VALUE.INIT_FILE PARAM_VALUE.INIT_FILE } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL 
parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_DEPTH}] ${MODELPARAM_VALUE.MEM_DEPTH} -} - -proc update_MODELPARAM_VALUE.MEM_WIDTH { MODELPARAM_VALUE.MEM_WIDTH PARAM_VALUE.MEM_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_WIDTH}] ${MODELPARAM_VALUE.MEM_WIDTH} -} - -proc update_MODELPARAM_VALUE.MEM_INIT { MODELPARAM_VALUE.MEM_INIT PARAM_VALUE.MEM_INIT } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_INIT}] ${MODELPARAM_VALUE.MEM_INIT} + set_property value [get_property value ${PARAM_VALUE.INIT_FILE}] ${MODELPARAM_VALUE.INIT_FILE} } proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.RAM_STYLE } { @@ -298,96 +87,6 @@ proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE. set_property value [get_property value ${PARAM_VALUE.RAM_STYLE}] ${MODELPARAM_VALUE.RAM_STYLE} } -proc update_MODELPARAM_VALUE.STRM0_WIDTH { MODELPARAM_VALUE.STRM0_WIDTH PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM0_WIDTH}] ${MODELPARAM_VALUE.STRM0_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM1_WIDTH { MODELPARAM_VALUE.STRM1_WIDTH PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_WIDTH}] ${MODELPARAM_VALUE.STRM1_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM2_WIDTH { MODELPARAM_VALUE.STRM2_WIDTH PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_WIDTH}] ${MODELPARAM_VALUE.STRM2_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM3_WIDTH { MODELPARAM_VALUE.STRM3_WIDTH PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_WIDTH}] ${MODELPARAM_VALUE.STRM3_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM4_WIDTH { MODELPARAM_VALUE.STRM4_WIDTH PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_WIDTH}] ${MODELPARAM_VALUE.STRM4_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM5_WIDTH { MODELPARAM_VALUE.STRM5_WIDTH PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_WIDTH}] ${MODELPARAM_VALUE.STRM5_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM0_DEPTH { MODELPARAM_VALUE.STRM0_DEPTH PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM0_DEPTH}] ${MODELPARAM_VALUE.STRM0_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM1_DEPTH { MODELPARAM_VALUE.STRM1_DEPTH PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_DEPTH}] ${MODELPARAM_VALUE.STRM1_DEPTH} -} - -proc 
update_MODELPARAM_VALUE.STRM2_DEPTH { MODELPARAM_VALUE.STRM2_DEPTH PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_DEPTH}] ${MODELPARAM_VALUE.STRM2_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM3_DEPTH { MODELPARAM_VALUE.STRM3_DEPTH PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_DEPTH}] ${MODELPARAM_VALUE.STRM3_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM4_DEPTH { MODELPARAM_VALUE.STRM4_DEPTH PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_DEPTH}] ${MODELPARAM_VALUE.STRM4_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM5_DEPTH { MODELPARAM_VALUE.STRM5_DEPTH PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_DEPTH}] ${MODELPARAM_VALUE.STRM5_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM0_OFFSET { MODELPARAM_VALUE.STRM0_OFFSET PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM0_OFFSET}] ${MODELPARAM_VALUE.STRM0_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM1_OFFSET { MODELPARAM_VALUE.STRM1_OFFSET PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_OFFSET}] ${MODELPARAM_VALUE.STRM1_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM2_OFFSET { MODELPARAM_VALUE.STRM2_OFFSET PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_OFFSET}] ${MODELPARAM_VALUE.STRM2_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM3_OFFSET { MODELPARAM_VALUE.STRM3_OFFSET PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_OFFSET}] ${MODELPARAM_VALUE.STRM3_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM4_OFFSET { MODELPARAM_VALUE.STRM4_OFFSET PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_OFFSET}] ${MODELPARAM_VALUE.STRM4_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET} -} - proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.AXILITE_ADDR_WIDTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH} diff --git a/finn-rtllib/swg/swg_common.sv b/finn-rtllib/swg/swg_common.sv new file mode 100644 index 0000000000..f2cdc333ca --- /dev/null +++ 
b/finn-rtllib/swg/swg_common.sv @@ -0,0 +1,249 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +// loop controller used for both, "default" and "parallel", implementation styles +module swg_controller +import swg::*; #( + int unsigned LOOP_H_ITERATIONS, + int unsigned LOOP_W_ITERATIONS, + int unsigned LOOP_KH_ITERATIONS, + int unsigned LOOP_KW_ITERATIONS, + int unsigned LOOP_SIMD_ITERATIONS, + + int unsigned INCR_BITWIDTH, + + bit IS_DEPTHWISE, + + int HEAD_INCR_SIMD, + int HEAD_INCR_KW, + int HEAD_INCR_KH, + int HEAD_INCR_W, + int HEAD_INCR_H, + int TAIL_INCR_W, + int TAIL_INCR_H, + int TAIL_INCR_LAST, + + state_e INNERMOST_STATE +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr +); + + // state and counters + state_e State = INNERMOST_STATE; + state_e state_next; + + logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS; + logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS; + logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS; + logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS; + logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS; + + // combinational logic for addr_incr generation + always_comb begin : blkHead + unique case (State) + STATE_START : addr_incr = 0; + STATE_LOOP_SIMD : addr_incr = HEAD_INCR_SIMD; + STATE_LOOP_KW : addr_incr = HEAD_INCR_KW; + STATE_LOOP_KH : addr_incr = HEAD_INCR_KH; + STATE_LOOP_W : addr_incr = HEAD_INCR_W; + STATE_LOOP_H : addr_incr = HEAD_INCR_H; + endcase + end + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? 
(Counter_loop_kh >= 0) : 0; + assign tail_incr = + tail_incr_inner_condition? 1 : + Counter_loop_w >= 0? TAIL_INCR_W : + Counter_loop_h >= 0? TAIL_INCR_H : + /* else */ TAIL_INCR_LAST; + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != INNERMOST_STATE) state_next = INNERMOST_STATE; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= INNERMOST_STATE; + Counter_loop_h <= LOOP_H_ITERATIONS; + Counter_loop_w <= LOOP_W_ITERATIONS; + Counter_loop_kh <= LOOP_KH_ITERATIONS; + Counter_loop_kw <= LOOP_KW_ITERATIONS; + Counter_loop_simd <= LOOP_SIMD_ITERATIONS; + end + else if(advance) begin + State <= state_next; + if (State == INNERMOST_STATE) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= LOOP_SIMD_ITERATIONS; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= LOOP_KW_ITERATIONS; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= LOOP_KH_ITERATIONS; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= LOOP_W_ITERATIONS; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= LOOP_H_ITERATIONS; + end + end + end + end + end + end + end + +endmodule : swg_controller + +// buffer used in "default" implementation style +module swg_cyclic_buffer_addressable #( + int unsigned WIDTH, + int unsigned DEPTH, + parameter RAM_STYLE = "auto" +)( + input logic clk, + + input logic write_enable, + input logic [$clog2(DEPTH)-1:0] write_addr, + input logic [WIDTH-1:0] data_in, + + input logic read_enable, + input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) 
read address of cyclic buffer
+    output logic [WIDTH-1:0] data_out
+);
+
+    (*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0] Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign data_out = Out;
+
+endmodule : swg_cyclic_buffer_addressable
+
+// buffer used in "parallel" implementation style
+module swg_reg_buffer
+#(
+    int unsigned WIDTH = 1,
+    int unsigned DEPTH = 1
+)
+(
+    input logic clk,
+    input logic shift_enable,
+    input logic [WIDTH-1:0] shift_in,
+    output logic [WIDTH-1:0] shift_out,
+    output logic [WIDTH*DEPTH-1:0] data_out
+);
+
+logic [WIDTH-1:0] Data [DEPTH-1:0];
+
+assign shift_out = Data[DEPTH-1];
+
+for (genvar e=0; e<DEPTH; e++)
+    assign data_out[e*WIDTH +: WIDTH] = Data[e];
+
+always_ff @(posedge clk) begin
+    if (shift_enable) begin
+        for (int i=DEPTH-1; i>0; i--)
+            Data[i] <= Data[i-1];
+        Data[0] <= shift_in;
+    end
+end
+endmodule : swg_reg_buffer
+
+// buffer used in "parallel" implementation style
+module swg_ram_buffer
+#(
+    int unsigned WIDTH,
+    int unsigned DEPTH,
+    parameter RAM_STYLE = "auto"
+)
+(
+    input logic clk,
+    input logic rst_n,
+    input logic shift_enable,
+    input logic [WIDTH-1:0] shift_in,
+    output logic [WIDTH-1:0] shift_out
+);
+
+logic [WIDTH-1:0] Out_reg;
+assign shift_out = Out_reg;
+
+logic [$clog2(DEPTH)-1:0] Addr_w = 0;
+logic [$clog2(DEPTH)-1:0] Addr_r = 0;
+
+(*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram [DEPTH-1:0];
+
+always_ff @(posedge clk) begin
+    if (rst_n == 1'b0) begin
+        Addr_w <= 0;
+        Addr_r <= 1;
+    end else begin
+        if (shift_enable) begin
+            Ram[Addr_w] <= shift_in;
+            Out_reg <= Ram[Addr_r];
+
+            if (Addr_w == DEPTH-1)
+                Addr_w <= 0;
+            else
+                Addr_w <= Addr_w + 1;
+
+            if (Addr_r == DEPTH-1)
+                Addr_r <= 0;
+            else
+                Addr_r <= Addr_r + 1;
+        end
+    end
+end
+endmodule : swg_ram_buffer
diff --git a/finn-rtllib/swg/swg_pkg.sv b/finn-rtllib/swg/swg_pkg.sv
new file mode 100644
index 0000000000..1200310aca
--- /dev/null
+++ b/finn-rtllib/swg/swg_pkg.sv
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION).
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +package swg; + typedef enum logic [2:0] { + STATE_START, + STATE_LOOP_SIMD, + STATE_LOOP_KW, + STATE_LOOP_KH, + STATE_LOOP_W, + STATE_LOOP_H + } state_e; +endpackage : swg diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v new file mode 100644 index 0000000000..1f39e4440e --- /dev/null +++ b/finn-rtllib/swg/swg_template_axilite.v @@ -0,0 +1,593 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$_axilite #(
+    // Users to add parameters here
+
+    // User parameters ends
+    // Do not modify the parameters beyond this line
+
+    // Width of S_AXI data bus
+    parameter integer C_S_AXI_DATA_WIDTH = 32,
+    // Width of S_AXI address bus
+    parameter integer C_S_AXI_ADDR_WIDTH = 6
+)(
+    // Users to add ports here
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg0,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg1,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg2,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg3,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg4,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg5,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg6,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg7,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg8,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg9,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg10,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg11,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg12,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg13,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg14,
+    output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg15,
+
+    // User ports ends
+    // Do not modify the ports beyond this line
+
+    // Global Clock Signal
+    input wire S_AXI_ACLK,
+    // Global Reset Signal. This Signal is Active LOW
+    input wire S_AXI_ARESETN,
+    // Write address (issued by master, accepted by Slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
+    // Write channel Protection type. This signal indicates the
+    // privilege and security level of the transaction, and whether
+    // the transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_AWPROT,
+    // Write address valid. This signal indicates that the master is signaling
+    // a valid write address and control information.
+    input wire S_AXI_AWVALID,
+    // Write address ready. This signal indicates that the slave is ready
+    // to accept an address and associated control signals.
+    output wire S_AXI_AWREADY,
+    // Write data (issued by master, accepted by Slave)
+    input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
+    // Write strobes. This signal indicates which byte lanes hold
+    // valid data. There is one write strobe bit for each eight
+    // bits of the write data bus.
+    input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
+    // Write valid. This signal indicates that valid write
+    // data and strobes are available.
+    input wire S_AXI_WVALID,
+    // Write ready. This signal indicates that the slave
+    // can accept the write data.
+    output wire S_AXI_WREADY,
+    // Write response. This signal indicates the status
+    // of the write transaction.
+    output wire [1 : 0] S_AXI_BRESP,
+    // Write response valid. This signal indicates that the channel
+    // is signaling a valid write response.
+    output wire S_AXI_BVALID,
+    // Response ready. This signal indicates that the master
+    // can accept a write response.
+    input wire S_AXI_BREADY,
+    // Read address (issued by master, accepted by Slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
+    // Protection type. This signal indicates the privilege
+    // and security level of the transaction, and whether the
+    // transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_ARPROT,
+    // Read address valid. This signal indicates that the channel
+    // is signaling valid read address and control information.
+    input wire S_AXI_ARVALID,
+    // Read address ready.
This signal indicates that the slave is + // ready to accept an address and associated control signals. + output wire S_AXI_ARREADY, + // Read data (issued by slave) + output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA, + // Read response. This signal indicates the status of the + // read transfer. + output wire [1 : 0] S_AXI_RRESP, + // Read valid. This signal indicates that the channel is + // signaling the required read data. + output wire S_AXI_RVALID, + // Read ready. This signal indicates that the master can + // accept the read data and response information. + input wire S_AXI_RREADY +); + +// AXI4LITE signals +reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr; +reg axi_awready; +reg axi_wready; +reg [1 : 0] axi_bresp; +reg axi_bvalid; +reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr; +reg axi_arready; +reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata; +reg [1 : 0] axi_rresp; +reg axi_rvalid; + +// Example-specific design signals +// local parameter for addressing 32 bit / 64 bit C_S_AXI_DATA_WIDTH +// ADDR_LSB is used for addressing 32/64 bit registers/memories +// ADDR_LSB = 2 for 32 bits (n downto 2) +// ADDR_LSB = 3 for 64 bits (n downto 3) +localparam integer ADDR_LSB = (C_S_AXI_DATA_WIDTH/32) + 1; +localparam integer OPT_MEM_ADDR_BITS = 3; +//---------------------------------------------- +//-- Signals for user logic register space example +//------------------------------------------------ +//-- Number of Slave Registers 16 +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg0; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg1; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg2; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg3; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg4; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg5; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg6; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg7; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg8; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg9; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg10; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg11; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg12; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg13; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg14; +reg [C_S_AXI_DATA_WIDTH-1:0] slv_reg15; +wire slv_reg_rden; +wire slv_reg_wren; +reg [C_S_AXI_DATA_WIDTH-1:0] reg_data_out; +integer byte_index; +reg aw_en; + +// I/O Connections assignments + +assign S_AXI_AWREADY = axi_awready; +assign S_AXI_WREADY = axi_wready; +assign S_AXI_BRESP = axi_bresp; +assign S_AXI_BVALID = axi_bvalid; +assign S_AXI_ARREADY = axi_arready; +assign S_AXI_RDATA = axi_rdata; +assign S_AXI_RRESP = axi_rresp; +assign S_AXI_RVALID = axi_rvalid; +// Implement axi_awready generation +// axi_awready is asserted for one S_AXI_ACLK clock cycle when both +// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_awready is +// de-asserted when reset is low. + +always @( posedge S_AXI_ACLK ) +begin + if ( S_AXI_ARESETN == 1'b0 ) + begin + axi_awready <= 1'b0; + aw_en <= 1'b1; + end + else + begin + if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en) + begin + // slave is ready to accept write address when + // there is a valid write address and write data + // on the write address and data bus. This design + // expects no outstanding transactions. + axi_awready <= 1'b1; + aw_en <= 1'b0; + end + else if (S_AXI_BREADY && axi_bvalid) + begin + aw_en <= 1'b1; + axi_awready <= 1'b0; + end + else + begin + axi_awready <= 1'b0; + end + end +end + +// Implement axi_awaddr latching +// This process is used to latch the address when both +// S_AXI_AWVALID and S_AXI_WVALID are valid. 
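+// Note: the aw_en flag in the block above additionally enforces a single
+// outstanding write transaction: it is cleared as soon as an address is
+// accepted and only set again once the response handshake
+// (S_AXI_BREADY && axi_bvalid) has completed, so a new AWADDR cannot be
+// latched mid-transaction.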
+
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_awaddr <= 0;
+    end
+  else
+    begin
+      if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en)
+        begin
+          // Write Address latching
+          axi_awaddr <= S_AXI_AWADDR;
+        end
+    end
+end
+
+// Implement axi_wready generation
+// axi_wready is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_wready is
+// de-asserted when reset is low.
+
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_wready <= 1'b0;
+    end
+  else
+    begin
+      if (~axi_wready && S_AXI_WVALID && S_AXI_AWVALID && aw_en )
+        begin
+          // slave is ready to accept write data when
+          // there is a valid write address and write data
+          // on the write address and data bus. This design
+          // expects no outstanding transactions.
+          axi_wready <= 1'b1;
+        end
+      else
+        begin
+          axi_wready <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and write logic generation
+// The write data is accepted and written to memory mapped registers when
+// axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to
+// select byte enables of slave registers while writing.
+// These registers are cleared when reset (active low) is applied.
+// Slave register write enable is asserted when valid address and data are available
+// and the slave is ready to accept the write address and write data.
+assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;
+
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      slv_reg0 <= 0;
+      slv_reg1 <= 0;
+      slv_reg2 <= 0;
+      slv_reg3 <= 0;
+      slv_reg4 <= 0;
+      slv_reg5 <= 0;
+      slv_reg6 <= 0;
+      slv_reg7 <= 0;
+      slv_reg8 <= 0;
+      slv_reg9 <= 0;
+      slv_reg10 <= 0;
+      slv_reg11 <= 0;
+      slv_reg12 <= 0;
+      slv_reg13 <= 0;
+      slv_reg14 <= 0;
+      slv_reg15 <= 0;
+    end
+  else begin
+    if (slv_reg_wren)
+      begin
+        case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+          4'h0:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+              if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 0
+                slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+              end
+          4'h1:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+              if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 1
+                slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+              end
+          4'h2:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+              if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 2
+                slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+              end
+          4'h3:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+              if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 3
+                slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+              end
+          4'h4:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+              if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 4
+                slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+              end
+          4'h5:
+            for (
byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 5 + slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'h6: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 6 + slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'h7: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 7 + slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'h8: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 8 + slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'h9: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 9 + slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hA: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 10 + slv_reg10[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hB: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 11 + slv_reg11[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hC: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 12 + slv_reg12[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hD: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 13 + slv_reg13[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hE: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 14 + slv_reg14[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + 4'hF: + for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 ) + if ( S_AXI_WSTRB[byte_index] == 1 ) begin + // Respective byte enables are asserted as per write strobes + // Slave register 15 + slv_reg15[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8]; + end + default : begin + slv_reg0 <= slv_reg0; + slv_reg1 <= slv_reg1; + slv_reg2 <= slv_reg2; + slv_reg3 <= slv_reg3; + slv_reg4 <= slv_reg4; + slv_reg5 <= slv_reg5; + slv_reg6 <= slv_reg6; + slv_reg7 <= slv_reg7; + 
slv_reg8 <= slv_reg8;
+          slv_reg9 <= slv_reg9;
+          slv_reg10 <= slv_reg10;
+          slv_reg11 <= slv_reg11;
+          slv_reg12 <= slv_reg12;
+          slv_reg13 <= slv_reg13;
+          slv_reg14 <= slv_reg14;
+          slv_reg15 <= slv_reg15;
+          end
+        endcase
+      end
+  end
+end
+
+// Implement write response logic generation
+// The write response and response valid signals are asserted by the slave
+// when axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted.
+// This marks the acceptance of address and indicates the status of
+// write transaction.
+
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_bvalid <= 0;
+      axi_bresp  <= 2'b0;
+    end
+  else
+    begin
+      if (axi_awready && S_AXI_AWVALID && ~axi_bvalid && axi_wready && S_AXI_WVALID)
+        begin
+          // indicates a valid write response is available
+          axi_bvalid <= 1'b1;
+          axi_bresp  <= 2'b0; // 'OKAY' response
+        end                   // work error responses in future
+      else
+        begin
+          if (S_AXI_BREADY && axi_bvalid)
+            //check if bready is asserted while bvalid is high
+            //(there is a possibility that bready is always asserted high)
+            begin
+              axi_bvalid <= 1'b0;
+            end
+        end
+    end
+end
+
+// Implement axi_arready generation
+// axi_arready is asserted for one S_AXI_ACLK clock cycle when
+// S_AXI_ARVALID is asserted. axi_arready is
+// de-asserted when reset (active low) is asserted.
+// The read address is also latched when S_AXI_ARVALID is
+// asserted. axi_araddr is reset to zero on reset assertion.
+
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_arready <= 1'b0;
+      axi_araddr  <= 32'b0;
+    end
+  else
+    begin
+      if (~axi_arready && S_AXI_ARVALID)
+        begin
+          // indicates that the slave has accepted the valid read address
+          axi_arready <= 1'b1;
+          // Read address latching
+          axi_araddr  <= S_AXI_ARADDR;
+        end
+      else
+        begin
+          axi_arready <= 1'b0;
+        end
+    end
+end
+
+// Implement axi_rvalid generation
+// axi_rvalid is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_ARVALID and axi_arready are asserted. The slave registers
+// data are available on the axi_rdata bus at this instance. The
+// assertion of axi_rvalid marks the validity of read data on the
+// bus and axi_rresp indicates the status of read transaction. axi_rvalid
+// is deasserted on reset (active low). axi_rresp and axi_rdata are
+// cleared to zero on reset (active low).
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_rvalid <= 0;
+      axi_rresp  <= 0;
+    end
+  else
+    begin
+      if (axi_arready && S_AXI_ARVALID && ~axi_rvalid)
+        begin
+          // Valid read data is available at the read data bus
+          axi_rvalid <= 1'b1;
+          axi_rresp  <= 2'b0; // 'OKAY' response
+        end
+      else if (axi_rvalid && S_AXI_RREADY)
+        begin
+          // Read data is accepted by the master
+          axi_rvalid <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and read logic generation
+// Slave register read enable is asserted when valid address is available
+// and the slave is ready to accept the read address.
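+// For illustration, a single register read unfolds as follows: S_AXI_ARVALID
+// is seen, axi_arready pulses for one cycle and latches the address,
+// slv_reg_rden fires in that same cycle, and on the next clock edge axi_rvalid
+// is asserted with the selected register on axi_rdata until S_AXI_RREADY is
+// seen. Reads are not pipelined; each AR handshake returns exactly one register.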
+assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;
+always @(*)
+begin
+  // Address decoding for reading registers
+  case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+    4'h0   : reg_data_out <= slv_reg0;
+    4'h1   : reg_data_out <= slv_reg1;
+    4'h2   : reg_data_out <= slv_reg2;
+    4'h3   : reg_data_out <= slv_reg3;
+    4'h4   : reg_data_out <= slv_reg4;
+    4'h5   : reg_data_out <= slv_reg5;
+    4'h6   : reg_data_out <= slv_reg6;
+    4'h7   : reg_data_out <= slv_reg7;
+    4'h8   : reg_data_out <= slv_reg8;
+    4'h9   : reg_data_out <= slv_reg9;
+    4'hA   : reg_data_out <= slv_reg10;
+    4'hB   : reg_data_out <= slv_reg11;
+    4'hC   : reg_data_out <= slv_reg12;
+    4'hD   : reg_data_out <= slv_reg13;
+    4'hE   : reg_data_out <= slv_reg14;
+    4'hF   : reg_data_out <= slv_reg15;
+    default : reg_data_out <= 0;
+  endcase
+end
+
+// Output register or memory read data
+always @( posedge S_AXI_ACLK )
+begin
+  if ( S_AXI_ARESETN == 1'b0 )
+    begin
+      axi_rdata <= 0;
+    end
+  else
+    begin
+      // When there is a valid read address (S_AXI_ARVALID) with
+      // acceptance of read address by the slave (axi_arready),
+      // output the read data
+      if (slv_reg_rden)
+        begin
+          axi_rdata <= reg_data_out; // register read data
+        end
+    end
+end
+
+// Add user logic here
+assign cfg_reg0 = slv_reg0;
+assign cfg_reg1 = slv_reg1;
+assign cfg_reg2 = slv_reg2;
+assign cfg_reg3 = slv_reg3;
+assign cfg_reg4 = slv_reg4;
+assign cfg_reg5 = slv_reg5;
+assign cfg_reg6 = slv_reg6;
+assign cfg_reg7 = slv_reg7;
+assign cfg_reg8 = slv_reg8;
+assign cfg_reg9 = slv_reg9;
+assign cfg_reg10 = slv_reg10;
+assign cfg_reg11 = slv_reg11;
+assign cfg_reg12 = slv_reg12;
+assign cfg_reg13 = slv_reg13;
+assign cfg_reg14 = slv_reg14;
+assign cfg_reg15 = slv_reg15;
+// User logic ends
+
+endmodule
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
new file mode 100644
index 0000000000..78a8d0a3b9
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -0,0 +1,237 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION).
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +module $TOP_MODULE_NAME$_impl #( + int BIT_WIDTH, + int SIMD, + int MMV_IN, + int MMV_OUT, + int LAST_READ_ELEM = $LAST_READ_ELEM$, + int LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int ELEM_PER_WINDOW = $ELEM_PER_WINDOW$, + int INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_write_enable; + uwire window_buffer_read_enable; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; + swg_cyclic_buffer_addressable #( + .WIDTH(BUF_IN_WIDTH), + .DEPTH(BUF_ELEM_TOTAL), + .RAM_STYLE($RAM_STYLE$) + ) window_buffer_inst ( + .clk(ap_clk), + + .write_enable(window_buffer_write_enable), + .write_addr(window_buffer_write_addr), + .data_in(window_buffer_in), + + .read_enable(window_buffer_read_enable), + .read_addr(window_buffer_read_addr), + .data_out(window_buffer_out) + ); + + //controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] tail_incr; + swg_controller #( + .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$), + .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$), + .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$), + .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$), + .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$), + .HEAD_INCR_SIMD($HEAD_INCR_SIMD$), + .HEAD_INCR_KW($HEAD_INCR_KW$), + .HEAD_INCR_KH($HEAD_INCR_KH$), + .HEAD_INCR_W($HEAD_INCR_W$), + .HEAD_INCR_H($HEAD_INCR_H$), + .TAIL_INCR_W($TAIL_INCR_W$), + .TAIL_INCR_H($TAIL_INCR_H$), + .TAIL_INCR_LAST($TAIL_INCR_LAST$), + .INCR_BITWIDTH($INCR_BITWIDTH$), + .IS_DEPTHWISE($IS_DEPTHWISE$), + .INNERMOST_STATE(swg::$INNERMOST_STATE$) + ) + controller_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .advance(advance_controller), + .addr_incr(addr_incr), + .tail_incr(tail_incr) + ); + + // Counters/address registers + // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg, + // so we can use automatic sign extension and simplify calculations w/ signed increment. + // Alternatively, we could manually sign-extend and shave off a bit here or there. 
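+    // For example (arbitrary values): with LAST_READ_ELEM = 99, the counters
+    // below get $clog2(100) + 1 = 7 + 1 = 8 bits, enough to hold both the
+    // element indices 0..99 and the -1 "nothing buffered yet" reset value.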
+ logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; + logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; + logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; + + // Control signals/registers + logic Write_cmd = 0; + logic Writing_done = 0; + uwire write_ok = Write_cmd && out_V_V_TREADY; + uwire write_blocked = Write_cmd && !out_V_V_TREADY; + + logic Fetching_done = 0; + uwire fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done; + + uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM; + uwire read_cmd = + !reading_done && ( // if there is still an input element left to read + Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride) + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) && + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem) + ) // (over-)write to buffer if oldest buffered element will no longer be needed + ); + uwire read_ok = read_cmd && in0_V_V_TVALID; + + //assign buffer control + assign window_buffer_write_addr = Window_buffer_write_addr_reg; + assign window_buffer_read_addr = Window_buffer_read_addr_reg; + assign window_buffer_write_enable = read_ok; + assign window_buffer_read_enable = fetch_cmd; + assign advance_controller = fetch_cmd; + + //assign I/O ports + assign window_buffer_in = in0_V_V_TDATA; + assign out_V_V_TDATA = window_buffer_out; + assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) + assign out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) + + //main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= 0; + First_elem_next_window <= 0; + Position_in_window <= 0; + Window_buffer_read_addr_reg <= 0; + Window_buffer_write_addr_reg <= 0; + Fetching_done <= 0; + Write_cmd <= 0; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 
0 : Window_buffer_write_addr_reg + 1; + Newest_buffered_elem <= Newest_buffered_elem+1; + + if (Newest_buffered_elem == LAST_READ_ELEM-1) begin + Window_buffer_write_addr_reg <= 0; + end + //check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + //start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Writing_done <= 0; + Fetching_done <= 0; + end + end + + if (fetch_cmd) begin + //count up to track which element index is about to be read from the buffer, and where it is located within the buffer + //use increment value calculated by controller + + // absolute buffer address wrap-around + automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0] ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr); + automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0] ra_correct = + (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL : + (ra < 0)? BUF_ELEM_TOTAL : 0; + Window_buffer_read_addr_reg <= ra + ra_correct; + + //keep track where we are within a window + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0; + + //update first element of next window to allow buffer overwrite up until that point + if (Position_in_window == 0) + First_elem_next_window <= First_elem_next_window + tail_incr; + + //check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) + Fetching_done <= 1; + else + Current_elem <= $signed(Current_elem) + addr_incr; + + // determine if prefetched data will be outstanding in the next cycle + // if we fetch in this cycle -> yes + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data + Write_cmd <= fetch_cmd; + end + + if (write_ok) + Write_cmd <= fetch_cmd; + + if (write_ok && Fetching_done) begin + //check if this is the last write cycle (Writing_done will be true afterwards) + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + //start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Fetching_done <= 0; + end else + Writing_done <= 1; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv new file mode 100644 index 0000000000..5a6fdda170 --- /dev/null +++ b/finn-rtllib/swg/swg_template_default_dynamic.sv @@ -0,0 +1,417 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module $TOP_MODULE_NAME$_controller #( + int unsigned CNTR_BITWIDTH, + int unsigned INCR_BITWIDTH, + + bit IS_DEPTHWISE = $IS_DEPTHWISE$ +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr, + + input logic cfg_valid, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kw, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last +); + + import swg::*; + + // (dynamic) configuration registers + logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd = $LOOP_SIMD_ITERATIONS$; + logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw = $LOOP_KW_ITERATIONS$; + logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kh = $LOOP_KH_ITERATIONS$; + logic [CNTR_BITWIDTH-1:0] Cfg_cntr_w = $LOOP_W_ITERATIONS$; + logic [CNTR_BITWIDTH-1:0] Cfg_cntr_h = $LOOP_H_ITERATIONS$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_head_simd = $HEAD_INCR_SIMD$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kw = $HEAD_INCR_KW$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kh = $HEAD_INCR_KH$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_head_w = $HEAD_INCR_W$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_head_h = $HEAD_INCR_H$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_w = $TAIL_INCR_W$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_h = $TAIL_INCR_H$; + logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_last = $TAIL_INCR_LAST$; + + // configuration reset/set logic + always_ff @ (posedge clk) begin + if(cfg_valid) begin + Cfg_cntr_simd <= cfg_cntr_simd; + Cfg_cntr_kw <= cfg_cntr_kw; + Cfg_cntr_kh <= cfg_cntr_kh; + Cfg_cntr_w <= cfg_cntr_w; + Cfg_cntr_h <= cfg_cntr_h; + Cfg_incr_head_simd <= cfg_incr_head_simd; + Cfg_incr_head_kw <= cfg_incr_head_kw; + Cfg_incr_head_kh <= cfg_incr_head_kh; + Cfg_incr_head_w 
<= cfg_incr_head_w; + Cfg_incr_head_h <= cfg_incr_head_h; + Cfg_incr_tail_w <= cfg_incr_tail_w; + Cfg_incr_tail_h <= cfg_incr_tail_h; + Cfg_incr_tail_last <= cfg_incr_tail_last; + end + end + + // state and counters + state_e State = $INNERMOST_STATE$; + state_e state_next; + + logic signed [$clog2($LOOP_H_ITERATIONS$ +2)+1-1:0] Counter_loop_h = $LOOP_H_ITERATIONS$; + logic signed [$clog2($LOOP_W_ITERATIONS$ +2)+1-1:0] Counter_loop_w = $LOOP_W_ITERATIONS$; + logic signed [$clog2($LOOP_KH_ITERATIONS$ +2)+1-1:0] Counter_loop_kh = $LOOP_KH_ITERATIONS$; + logic signed [$clog2($LOOP_KW_ITERATIONS$ +2)+1-1:0] Counter_loop_kw = $LOOP_KW_ITERATIONS$; + logic signed [$clog2($LOOP_SIMD_ITERATIONS$+2)+1-1:0] Counter_loop_simd = $LOOP_SIMD_ITERATIONS$; + + // combinational logic for addr_incr generation + always_comb begin : blkHead + unique case (State) + 0 : addr_incr = 0; + 1 : addr_incr = Cfg_incr_head_simd; + 2 : addr_incr = Cfg_incr_head_kw; + 3 : addr_incr = Cfg_incr_head_kh; + 4 : addr_incr = Cfg_incr_head_w; + 5 : addr_incr = Cfg_incr_head_h; + endcase + end + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + assign tail_incr = + tail_incr_inner_condition? 1 : + Counter_loop_w >= 0? Cfg_incr_tail_w : + Counter_loop_h >= 0? Cfg_incr_tail_h : + /* else */ Cfg_incr_tail_last; + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= $INNERMOST_STATE$; + Counter_loop_h <= Cfg_cntr_h; + Counter_loop_w <= Cfg_cntr_w; + Counter_loop_kh <= Cfg_cntr_kh; + Counter_loop_kw <= Cfg_cntr_kw; + Counter_loop_simd <= Cfg_cntr_simd; + end + else if(advance) begin + State <= state_next; + if (State == $INNERMOST_STATE$) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= Cfg_cntr_simd; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= Cfg_cntr_kw; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= Cfg_cntr_kh; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= Cfg_cntr_w; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= Cfg_cntr_h; + end + end + end + end + end + end + end + +endmodule : $TOP_MODULE_NAME$_controller + +module $TOP_MODULE_NAME$_impl #( + int BIT_WIDTH, + int SIMD, + int MMV_IN, + int MMV_OUT, + int unsigned CNTR_BITWIDTH, + int unsigned INCR_BITWIDTH, + + int LAST_READ_ELEM = $LAST_READ_ELEM$, + int LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int ELEM_PER_WINDOW = $ELEM_PER_WINDOW$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA, + + input logic cfg_valid, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd, + input logic [CNTR_BITWIDTH-1:0] 
cfg_cntr_kw, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w, + input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w, + input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h, + input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last, + input logic [31:0] cfg_last_read, + input logic [31:0] cfg_last_write +); + // derived constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // (dynamic) configuration registers + logic [31:0] Cfg_last_read = LAST_READ_ELEM; + logic [31:0] Cfg_last_write = LAST_WRITE_ELEM; + + // configuration reset/set logic + always_ff @ (posedge ap_clk) begin + if(cfg_valid) begin + Cfg_last_read <= cfg_last_read; + Cfg_last_write <= cfg_last_write; + end + end + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_write_enable; + uwire window_buffer_read_enable; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; + uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; + swg_cyclic_buffer_addressable #( + .WIDTH(BUF_IN_WIDTH), + .DEPTH(BUF_ELEM_TOTAL), + .RAM_STYLE($RAM_STYLE$) + ) window_buffer_inst ( + .clk(ap_clk), + + .write_enable(window_buffer_write_enable), + .write_addr(window_buffer_write_addr), + .data_in(window_buffer_in), + + .read_enable(window_buffer_read_enable), + .read_addr(window_buffer_read_addr), + .data_out(window_buffer_out) + ); + + //controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] tail_incr; + $TOP_MODULE_NAME$_controller #( + .CNTR_BITWIDTH(CNTR_BITWIDTH), + .INCR_BITWIDTH(INCR_BITWIDTH) + ) controller_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .advance(advance_controller), + .addr_incr(addr_incr), + .tail_incr(tail_incr), + + .cfg_valid(cfg_valid), + .cfg_cntr_simd(cfg_cntr_simd), + .cfg_cntr_kw(cfg_cntr_kw), + .cfg_cntr_kh(cfg_cntr_kh), + .cfg_cntr_w(cfg_cntr_w), + .cfg_cntr_h(cfg_cntr_h), + .cfg_incr_head_simd(cfg_incr_head_simd), + .cfg_incr_head_kw(cfg_incr_head_kw), + .cfg_incr_head_kh(cfg_incr_head_kh), + .cfg_incr_head_w(cfg_incr_head_w), + .cfg_incr_head_h(cfg_incr_head_h), + .cfg_incr_tail_w(cfg_incr_tail_w), + .cfg_incr_tail_h(cfg_incr_tail_h), + .cfg_incr_tail_last(cfg_incr_tail_last) + ); + + // Counters/address registers + // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg, + // so we can use automatic sign extension and simplify calculations w/ signed increment. + // Alternatively, we could manually sign-extend and shave off a bit here or there. 
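+    // Note: these counters are sized from the compile-time worst case
+    // (LAST_READ_ELEM etc.), while the run-time comparisons use the dynamic
+    // Cfg_last_read / Cfg_last_write registers; a dynamic configuration is
+    // therefore expected to stay within the compile-time maxima so the
+    // counters below cannot overflow.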
+ logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; + logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; + logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; + + // Control signals/registers + logic Write_cmd = 0; + logic Writing_done = 0; + uwire write_ok = Write_cmd && out_V_V_TREADY; + uwire write_blocked = Write_cmd && !out_V_V_TREADY; + + logic Fetching_done = 0; + uwire fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done; + + uwire reading_done = Newest_buffered_elem == Cfg_last_read; + uwire read_cmd = + !reading_done && ( // if there is still an input element left to read + Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride) + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) && + $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem) + ) // (over-)write to buffer if oldest buffered element will no longer be needed + ); + uwire read_ok = read_cmd && in0_V_V_TVALID; + + //assign buffer control + assign window_buffer_write_addr = Window_buffer_write_addr_reg; + assign window_buffer_read_addr = Window_buffer_read_addr_reg; + assign window_buffer_write_enable = read_ok; + assign window_buffer_read_enable = fetch_cmd; + assign advance_controller = fetch_cmd; + + //assign I/O ports + assign window_buffer_in = in0_V_V_TDATA; + assign out_V_V_TDATA = window_buffer_out; + assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) + assign out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) + + //main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= 0; + First_elem_next_window <= 0; + Position_in_window <= 0; + Window_buffer_read_addr_reg <= 0; + Window_buffer_write_addr_reg <= 0; + Fetching_done <= 0; + Write_cmd <= 0; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 
0 : Window_buffer_write_addr_reg + 1; + Newest_buffered_elem <= Newest_buffered_elem+1; + + if (Newest_buffered_elem == Cfg_last_read-1) begin + Window_buffer_write_addr_reg <= 0; + end + //check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == Cfg_last_read-1) && Writing_done) begin + //start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Writing_done <= 0; + Fetching_done <= 0; + end + end + + if (fetch_cmd) begin + //count up to track which element index is about to be read from the buffer, and where it is located within the buffer + //use increment value calculated by controller + + // absolute buffer address wrap-around + automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0] ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr); + automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0] ra_correct = + (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL : + (ra < 0)? BUF_ELEM_TOTAL : 0; + Window_buffer_read_addr_reg <= ra + ra_correct; + + //keep track where we are within a window + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0; + + //update first element of next window to allow buffer overwrite up until that point + if (Position_in_window == 0) + First_elem_next_window <= First_elem_next_window + tail_incr; + + //check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == Cfg_last_write) + Fetching_done <= 1; + else + Current_elem <= $signed(Current_elem) + addr_incr; + + // determine if prefetched data will be outstanding in the next cycle + // if we fetch in this cycle -> yes + // if we do not fetch nor write -> do not change + // if we do not fetch but write successfully-> clear outstanding data + Write_cmd <= fetch_cmd; + end + + if (write_ok) + Write_cmd <= fetch_cmd; + + if (write_ok && Fetching_done) begin + //check if this is the last write cycle (Writing_done will be true afterwards) + if (reading_done || (read_ok && (Newest_buffered_elem == Cfg_last_read - 1))) begin + //start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= 0; + Window_buffer_read_addr_reg <= 0; + First_elem_next_window <= 0; + Fetching_done <= 0; + end else + Writing_done <= 1; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv new file mode 100644 index 0000000000..83a525ff36 --- /dev/null +++ b/finn-rtllib/swg/swg_template_parallel.sv @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module $TOP_MODULE_NAME$_wb +#( + int unsigned IN_WIDTH = 1, // bit-width*C*MMV_in + int unsigned OUT_ELEM_WIDTH = 1, // bit-width*C + int unsigned OUT_WIDTH = 1, // bit-width*C*MMV_out + int unsigned BUFFER_ELEM_TOTAL = 1 +) +( + input logic clk, + input logic rst_n, + input logic shift_enable, + input logic [IN_WIDTH-1:0] data_in, + output logic [OUT_WIDTH-1:0] data_out +); + +$GENERATE_REG_FIFOS$ + +$GENERATE_BRAM_FIFOS$ + +// fixed interconnect between linear buffers +$GENERATE_BUFFER_CONNECTION$ + +// fixed REG FIFO -> output mapping +$GENERATE_OUTPUT_MAPPING$ + +endmodule : $TOP_MODULE_NAME$_wb + +module $TOP_MODULE_NAME$_impl #( + int unsigned BIT_WIDTH, + int unsigned SIMD, + int unsigned MMV_IN, + int unsigned MMV_OUT, + int unsigned LAST_READ_ELEM = $LAST_READ_ELEM$, + int unsigned FIRST_WRITE_ELEM = $FIRST_WRITE_ELEM$, + int unsigned LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int unsigned BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_shift_enable; + $TOP_MODULE_NAME$_wb + #( + .IN_WIDTH(BUF_IN_WIDTH), + .OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH), + .OUT_WIDTH(BUF_OUT_WIDTH), + .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL) + ) + window_buffer_inst + ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .data_in(window_buffer_in), + .shift_enable(window_buffer_shift_enable), + .data_out(window_buffer_out) + ); + + // controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] tail_incr; + swg_controller #( + .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$), + 
.LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$),
+        .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$),
+        .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$),
+        .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$),
+        .HEAD_INCR_SIMD($HEAD_INCR_SIMD$),
+        .HEAD_INCR_KW($HEAD_INCR_KW$),
+        .HEAD_INCR_KH($HEAD_INCR_KH$),
+        .HEAD_INCR_W($HEAD_INCR_W$),
+        .HEAD_INCR_H($HEAD_INCR_H$),
+        .TAIL_INCR_W($TAIL_INCR_W$),
+        .TAIL_INCR_H($TAIL_INCR_H$),
+        .TAIL_INCR_LAST($TAIL_INCR_LAST$),
+        .INCR_BITWIDTH($INCR_BITWIDTH$),
+        .IS_DEPTHWISE($IS_DEPTHWISE$),
+        .INNERMOST_STATE(swg::$INNERMOST_STATE$)
+    )
+    controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // counters/address registers
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = FIRST_WRITE_ELEM;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0;
+
+    // control registers/signals
+    logic Writing_done = 0;
+    logic Write_done   = 0;
+    uwire write_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !Writing_done;
+    uwire write_ok  = write_cmd && (out_V_V_TREADY || Write_done);
+    uwire write_blocked = write_cmd && !out_V_V_TREADY && !Write_done;
+
+    uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+    uwire read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Writing_done || ( // if writing is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - ($signed(BUF_ELEM_TOTAL) - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - ($signed(BUF_ELEM_TOTAL) - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire read_ok = read_cmd && in0_V_V_TVALID && !write_blocked;
+
+    // advance on a successful read; in a write-only cycle, wait only for the
+    // write; if neither a read nor a write is pending, there is nothing to wait for
+    uwire advance = read_ok || (!read_cmd && write_ok) || (!read_cmd && !write_cmd);
+
+    // assign buffer control
+    assign window_buffer_shift_enable = advance;
+    assign advance_controller = write_ok;
+
+    // assign I/O ports
+    assign window_buffer_in = in0_V_V_TDATA;
+    assign out_V_V_TDATA = window_buffer_out;
+    assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign out_V_V_TVALID = ap_rst_n && write_cmd && !Write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    // write done logic
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Write_done <= 1'b0;
+        end
+        else begin
+            if (advance) begin
+                Write_done <= 1'b0; //reset flag
+            end else if (write_ok) //successful W in this cycle, but R still outstanding
+                Write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle!
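+            // Illustrative scenario: if the sink accepts the current window
+            // (write_ok) while the source stalls (no read_ok), the buffer
+            // cannot shift yet; Write_done records the completed write so
+            // out_V_V_TVALID is deasserted and the same window is not emitted
+            // twice while the pipeline waits to advance.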
+ end + end + + // main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + First_elem_next_window <= 0; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Newest_buffered_elem <= Newest_buffered_elem+1; + + // check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + // start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + // todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + First_elem_next_window <= 0; + Writing_done <= 0; + end + end + + if (write_ok) begin + First_elem_next_window <= First_elem_next_window + tail_incr; + + // check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) begin + Writing_done <= 1; + + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + // start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + First_elem_next_window <= 0; + Writing_done <= 0; + end + end + else + Current_elem <= $signed(Current_elem) + addr_incr; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v new file mode 100644 index 0000000000..11fa0a88cb --- /dev/null +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -0,0 +1,72 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +module $TOP_MODULE_NAME$ ( + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + input [BUF_IN_WIDTH-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + output [BUF_OUT_WIDTH-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +// top-level parameters (set via code-generation) +parameter BIT_WIDTH = $BIT_WIDTH$; +parameter SIMD = $SIMD$; +parameter MMV_IN = $MMV_IN$; +parameter MMV_OUT = $MMV_OUT$; + +// derived constants +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + +$TOP_MODULE_NAME$_impl #( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT) +) impl ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA), + .out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY) +); + +endmodule : $TOP_MODULE_NAME$ diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v new file mode 100644 index 0000000000..5c09e7c1b4 --- /dev/null +++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v @@ -0,0 +1,181 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +module $TOP_MODULE_NAME$ #( + // top-level parameters (set via code-generation) + parameter BIT_WIDTH = $BIT_WIDTH$, + parameter SIMD = $SIMD$, + parameter MMV_IN = $MMV_IN$, + parameter MMV_OUT = $MMV_OUT$, + + parameter CNTR_BITWIDTH = $CNTR_BITWIDTH$, + parameter INCR_BITWIDTH = $INCR_BITWIDTH$, + + // derived constants + parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN, + parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT, + + parameter integer C_s_axilite_DATA_WIDTH = 32, + parameter integer C_s_axilite_ADDR_WIDTH = 6 +) +( + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + input [BUF_IN_WIDTH-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + output [BUF_OUT_WIDTH-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY, + + // Ports of Axi Slave Bus Interface s_axilite + input [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_awaddr, + input [2 : 0] s_axilite_awprot, + input s_axilite_awvalid, + output s_axilite_awready, + input [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_wdata, + input [(C_s_axilite_DATA_WIDTH/8)-1 : 0] s_axilite_wstrb, + input s_axilite_wvalid, + output s_axilite_wready, + output [1 : 0] s_axilite_bresp, + output s_axilite_bvalid, + input s_axilite_bready, + input [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_araddr, + input [2 : 0] s_axilite_arprot, + input s_axilite_arvalid, + output s_axilite_arready, + output [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_rdata, + output [1 : 0] s_axilite_rresp, + output s_axilite_rvalid, + input s_axilite_rready +); + +wire cfg_valid; +wire [CNTR_BITWIDTH-1:0] cfg_cntr_simd; +wire [CNTR_BITWIDTH-1:0] cfg_cntr_kw; +wire [CNTR_BITWIDTH-1:0] cfg_cntr_kh; +wire [CNTR_BITWIDTH-1:0] cfg_cntr_w; +wire [CNTR_BITWIDTH-1:0] cfg_cntr_h; +wire [INCR_BITWIDTH-1:0] cfg_incr_head_simd; +wire [INCR_BITWIDTH-1:0] cfg_incr_head_kw; +wire [INCR_BITWIDTH-1:0] cfg_incr_head_kh; +wire [INCR_BITWIDTH-1:0] cfg_incr_head_w; +wire [INCR_BITWIDTH-1:0] cfg_incr_head_h; +wire [INCR_BITWIDTH-1:0] cfg_incr_tail_w; +wire [INCR_BITWIDTH-1:0] cfg_incr_tail_h; +wire [INCR_BITWIDTH-1:0] cfg_incr_tail_last; +wire [31:0] cfg_last_read; +wire [31:0] cfg_last_write; + +// Instantiation of Axi Bus Interface s_axilite +$TOP_MODULE_NAME$_axilite # ( + .C_S_AXI_DATA_WIDTH(C_s_axilite_DATA_WIDTH), + .C_S_AXI_ADDR_WIDTH(C_s_axilite_ADDR_WIDTH) +) axilite_cfg_inst ( + .S_AXI_ACLK(ap_clk), + .S_AXI_ARESETN(ap_rst_n), + .S_AXI_AWADDR(s_axilite_awaddr), + .S_AXI_AWPROT(s_axilite_awprot), + .S_AXI_AWVALID(s_axilite_awvalid), + .S_AXI_AWREADY(s_axilite_awready), + .S_AXI_WDATA(s_axilite_wdata), + .S_AXI_WSTRB(s_axilite_wstrb), + .S_AXI_WVALID(s_axilite_wvalid), + .S_AXI_WREADY(s_axilite_wready), + .S_AXI_BRESP(s_axilite_bresp), + .S_AXI_BVALID(s_axilite_bvalid), + .S_AXI_BREADY(s_axilite_bready), + .S_AXI_ARADDR(s_axilite_araddr), + .S_AXI_ARPROT(s_axilite_arprot), + .S_AXI_ARVALID(s_axilite_arvalid), + .S_AXI_ARREADY(s_axilite_arready), + .S_AXI_RDATA(s_axilite_rdata), + .S_AXI_RRESP(s_axilite_rresp), + .S_AXI_RVALID(s_axilite_rvalid), + .S_AXI_RREADY(s_axilite_rready), + + .cfg_reg0(cfg_valid), + .cfg_reg1(cfg_cntr_simd), + .cfg_reg2(cfg_cntr_kw), + .cfg_reg3(cfg_cntr_kh), + .cfg_reg4(cfg_cntr_w), + .cfg_reg5(cfg_cntr_h), + .cfg_reg6(cfg_incr_head_simd), + .cfg_reg7(cfg_incr_head_kw), + 
.cfg_reg8(cfg_incr_head_kh), + .cfg_reg9(cfg_incr_head_w), + .cfg_reg10(cfg_incr_head_h), + .cfg_reg11(cfg_incr_tail_w), + .cfg_reg12(cfg_incr_tail_h), + .cfg_reg13(cfg_incr_tail_last), + .cfg_reg14(cfg_last_read), + .cfg_reg15(cfg_last_write) +); + +$TOP_MODULE_NAME$_impl #( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT), + .CNTR_BITWIDTH(CNTR_BITWIDTH), + .INCR_BITWIDTH(INCR_BITWIDTH) +) impl ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA), + .out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY), + + .cfg_valid(cfg_valid), + .cfg_cntr_simd(cfg_cntr_simd), + .cfg_cntr_kw(cfg_cntr_kw), + .cfg_cntr_kh(cfg_cntr_kh), + .cfg_cntr_w(cfg_cntr_w), + .cfg_cntr_h(cfg_cntr_h), + .cfg_incr_head_simd(cfg_incr_head_simd), + .cfg_incr_head_kw(cfg_incr_head_kw), + .cfg_incr_head_kh(cfg_incr_head_kh), + .cfg_incr_head_w(cfg_incr_head_w), + .cfg_incr_head_h(cfg_incr_head_h), + .cfg_incr_tail_w(cfg_incr_tail_w), + .cfg_incr_tail_h(cfg_incr_tail_h), + .cfg_incr_tail_last(cfg_incr_tail_last), + .cfg_last_read(cfg_last_read), + .cfg_last_write(cfg_last_write) +); + +endmodule : $TOP_MODULE_NAME$ diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb index a4ad32ed7f..f915b11fa0 100644 --- a/notebooks/advanced/0_custom_analysis_pass.ipynb +++ b/notebooks/advanced/0_custom_analysis_pass.ipynb @@ -52,7 +52,9 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(\"../LFCW1A1.onnx\")" + "import os\n", + "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n", + "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -69,7 +71,7 @@ "outputs": [], "source": [ "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper('../LFCW1A1.onnx')" + "model = ModelWrapper(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -137,7 +139,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb index e40a534af5..7e4989c902 100644 --- a/notebooks/advanced/1_custom_transformation_pass.ipynb +++ b/notebooks/advanced/1_custom_transformation_pass.ipynb @@ -110,8 +110,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n", + "\n", "import onnx\n", - "onnx_model = onnx.load('../LFCW1A1.onnx')\n", + "onnx_model = onnx.load(notebook_dir + \"/LFCW1A1.onnx\")\n", "from qonnx.core.modelwrapper import ModelWrapper\n", "onnx_model = ModelWrapper(onnx_model)" ] @@ -122,7 +125,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('../LFCW1A1.onnx')" + "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -233,7 +236,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index c27f8bdca7..636da64dd5 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -8,14 +8,14 @@ "\n", "Suppose that you want to introduce a new (custom) operation type into the FINN compiler. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. 
This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", "\n", - "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vivado HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... execution and code generation paths for custom nodes.\n", + "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vitis HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... execution and code generation paths for custom nodes.\n", "\n", "## The CustomOp base class\n", "\n", "Subclasses of `CustomOp` provide a way of providing custom functionality for ONNX nodes in FINN.\n", "This is the base class for every custom op node used in the framework, so you must create subclasses of `CustomOp` to provide execution, code generation and other functionalities in FINN.\n", "\n", - "Let's start by looking at the `CustomOp` base class itself, which lives in the `finn-base` repository. You can view it [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/base.py). Note that the `finn` Docker container already has `finn-base` set up as a dependency.\n", + "Let's start by looking at the `CustomOp` base class itself, which lives in the `qonnx` repository. You can view it [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/custom_op/base.py). Note that the `finn` Docker container already has `qonnx` set up as a dependency.\n", "\n", "Some points of importance:\n", "\n", @@ -23,7 +23,7 @@ "\n", "2. `CustomOp` subclasses need to implement the methods below (those not starting with underscore).\n", "\n", - "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/general/im2col.py), the ONNX node must use `domain=qonnx.custom_op.general` since its module is located at `finn/custom_op/general/im2col.py`." + "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/custom_op/general/im2col.py), the ONNX node must use `domain=qonnx.custom_op.general` since its module is located at `qonnx/custom_op/general/im2col.py`." ] }, { @@ -130,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To make sure our custom op is available, it needs to be registered. The best practice for this is to create a submodule under `finn.custom_op` which includes a `custom_op` dictionary that maps strings (op names) to classes (op implementations). Since we're in a Jupyter notebook we'll just hijack it at runtime like this:" + "To make sure our custom op is available, it needs to be registered. The best practice for this is to create a submodule under `qonnx.custom_op` which includes a `custom_op` dictionary that maps strings (op names) to classes (op implementations). 
Since we're in a Jupyter notebook we'll just hijack it at runtime like this:"
   ]
  },
  {
@@ -178,6 +178,7 @@
    "source": [
     "from qonnx.core.modelwrapper import ModelWrapper\n",
     "from onnx import TensorProto\n",
+    "from qonnx.util.basic import qonnx_make_model\n",
     "\n",
     "def make_graph(ishape, exp, op_type = \"MyPythonPowerOp\"):\n",
     "    inp = helper.make_tensor_value_info(\n",
@@ -204,7 +205,7 @@
     "    graph = helper.make_graph(\n",
     "        nodes=[custom_node], name=\"custom_graph\", inputs=[inp], outputs=[outp]\n",
     "    )\n",
-    "    model = helper.make_model(graph, producer_name=\"custom-model\")\n",
+    "    model = qonnx_make_model(graph, producer_name=\"custom-model\")\n",
     "    return ModelWrapper(model)"
    ]
   },
@@ -657,7 +658,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb
new file mode 100644
index 0000000000..07b66da52f
--- /dev/null
+++ b/notebooks/advanced/3_folding.ipynb
@@ -0,0 +1,664 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FINN - Folding\n",
+    "--------------------------------------\n",
+    "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n",
+    "\n",
+    "There is a local copy of `step_convert_to_hls.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` so that you can go through this tutorial without prerequisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates-only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_convert_to_hls.onnx`. \n",
+    "\n",
+    "This notebook describes the use of the FINN parallelization parameters (PE & SIMD), also called folding factors, to optimize models efficiently and extract the maximum performance from them. \n",
+    "\n",
+    "Please be aware that the folding factors cannot be selected arbitrarily; each layer has constraints on which values the parallelization parameters can be set to. For more information see here: https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer\n",
+    "\n",
+    "We'll use the utility function `showInNetron()` to visualize and interact with our network in the Jupyter Notebook and `showSrc()` to show source code of FINN library calls."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.util.visualization import showInNetron, showSrc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: The build_flow in the cybsec_mlp notebook comprises a transformation step `step_target_fps_parallelization` that automatically sets custom parallelization parameters needed to achieve a given `target_fps` by invoking the [`SetFolding` transformation](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py#L46).\n",
+    "\n",
+    "More details of the above step can be found [here](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L394)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook shows the manual version of this step and explains how these attributes can improve performance and what their effects on resource utilization are, for developers who need to maximize the performance of their network. \n",
+ "\n",
+ "For that we will use the `cybsec_PE_SIMD.onnx` file as a starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HLS layers. Each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n",
+ "\n",
+ "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### FINN-style Dataflow Architectures \n",
+ "\n",
+ "We start with a quick recap of FINN-style dataflow architectures. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure below.\n",
+ "\n",
+ "![](finn-dataflow.png)\n",
+ "\n",
+ "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library.\n",
+ "\n",
+ "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n",
+ "\n",
+ "![](finn-folding.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 1 : Loading the ONNX model\n",
+ "\n",
+ "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n",
+ "\n",
+ "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n",
+ "We pick the ONNX file `cybsec_PE_SIMD.onnx`, to which the necessary transformations for this notebook have already been applied. This means the network layers have been mapped to the corresponding FINN-HLS blocks, in this case `MatrixVectorActivation` units. \n",
+ "\n",
+ "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n",
+ "\n",
+ "In the cell below, we load our ONNX file and view the cybersecurity MLP network in Netron."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\" \n",
+ "model = ModelWrapper(model_path)\n",
+ "\n",
+ "showInNetron(model_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 2 : Parallelization Parameters: PE & SIMD"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The computational parallelism can be varied by setting the folding factors, also called parallelization parameters, **PE** and **SIMD** of each layer. 
These parallelization attributes are subject to certain constraints and should be selected accordingly.\n",
+ "\n",
+ "To see more details about how this is implemented in the `MatrixVectorActivation` layer (MVAU), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n",
+ "\n",
+ "![](finn-folding-mvau.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the case of the MVAU, `PE` & `SIMD` are subject to the following constraints: \n",
+ "\n",
+ "If `MW` is the number of input features and `MH` the number of output features:\n",
+ "\n",
+ "    MW % SIMD == 0\n",
+ "    MH % PE == 0\n",
+ "    \n",
+ "Total folding in the case of the MVAU is defined as:\n",
+ "\n",
+ "    Total folding = (MH/PE) x (MW/SIMD)\n",
+ "\n",
+ "For the first layer of our example network (`MW = 600`, `MH = 64`), the default `PE = SIMD = 1` therefore gives a total folding of `64 x 600 = 38400`.\n",
+ "\n",
+ "In a streaming dataflow architecture like a FINN design, the throughput is determined by the slowest layer. So, the goal of adjusting these parameters is to get an almost balanced pipeline, i.e. to equalize the throughput rates of the layers in the generated dataflow architecture.\n",
+ "\n",
+ "The FINN compiler provides analysis passes to facilitate the exploration of the folding factors of each layer. In this notebook we will show how to use these functions and explore how the parallelization parameters affect the clock cycles and the resource utilization of the generated dataflow architecture.\n",
+ "\n",
+ "We start with a naive case where the `PE` & `SIMD` values across all layers are 1; this is the starting point of our exploration and is also the state the network is in after the conversion to HLS layers. If you take a look at the model using Netron and click on one of the MVAU layers, you can see that `PE` and `SIMD` are both set to 1 by default."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(model_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We import the analysis passes `exp_cycles_per_layer()` and `res_estimation()` to estimate the number of clock cycles and the resource utilization of each network layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n",
+ "from finn.analysis.fpgadataflow.res_estimation import res_estimation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Analysis passes in FINN return information about the model in the form of a dictionary; you can learn more about analysis passes in general in this Jupyter notebook: [0_custom_analysis_pass.ipynb](0_custom_analysis_pass.ipynb).\n",
+ "\n",
+ "We start by calling the analysis pass `exp_cycles_per_layer()`, which returns a dictionary with the layer names as keys and the expected cycles as values. Afterwards, we plot the result as a bar chart."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cycles_dict = model.analysis(exp_cycles_per_layer)\n",
+ "cycles_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(cycles_dict.keys(), cycles_dict.values(), color ='blue', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of clock cycles\")\n",
+ "plt.title(\"Clock cycles per layer PE=SIMD=1\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We observe that the bottleneck in the execution of the model on hardware would come from the execution of the first layer, which takes an estimated 38400 clock cycles to process one set of its inputs.\n",
+ "\n",
+ "No matter how quickly the other layers execute, the throughput will be defined by the first layer's execution latency.\n",
+ "\n",
+ "Let's now have a look at the estimated resources per layer by calling another analysis pass.\n",
+ "The keys are again the layer names, but the values are now a dictionary with the resource estimates per layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res_dict = model.analysis(res_estimation)\n",
+ "res_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1; you can see that above for `URAM_efficiency`. In all other cases, the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. In our example this means that MVAU_0 uses 5 BRAM blocks, which are 83% utilized. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After we extract that information from the model, we plot the number of LUTs. In this notebook we concentrate on the influence on LUT usage, but by manipulating the code below, you can also extract information about memory and DSP usage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs = [res_dict[key][\"LUT\"] for key in res_dict.keys()] \n",
+ "\n",
+ "# Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(res_dict.keys(), LUTs, color ='green', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of LUTs\")\n",
+ "plt.title(\"No. of LUTs per layer PE=SIMD=1\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since we identified above that the first layer takes the highest number of cycles to complete its execution, we will now try to adjust the folding parameters to reduce its latency at the expense of an increase in resource utilization.\n",
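+ "\n",
+ "For larger networks, finding the bottleneck by eye gets tedious. Since `cycles_dict` is an ordinary Python dictionary, a one-liner can identify the slowest layer (illustrative sketch):\n",
+ "\n",
+ "```python\n",
+ "bottleneck = max(cycles_dict, key=cycles_dict.get)  # layer with most cycles\n",
+ "print(bottleneck, cycles_dict[bottleneck])          # expected: the first MVAU, 38400\n",
+ "```"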
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Modify Parameters\n",
+ "\n",
+ "We now modify the parallelization parameters of the first network layer to reduce its latency.\n",
+ "We only extract the first `MatrixVectorActivation` block from the model and set the parallelization parameters manually.\n",
+ "\n",
+ "In the first step, we left the `PE` & `SIMD` values for all the layers at their default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n",
+ "\n",
+ "To set `PE` & `SIMD`, we will utilize functionality from the FINN compiler. Each layer type has a Python wrapper which can be instantiated using the `getCustomOp()` function. The wrapper offers several helper functions like `get_nodeattr()` and `set_nodeattr()` to access and set the attributes of a node."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from qonnx.custom_op.registry import getCustomOp\n",
+ "\n",
+ "list_of_mvaus = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "mvau0 = list_of_mvaus[0]\n",
+ "\n",
+ "mvau0_inst = getCustomOp(mvau0)\n",
+ "\n",
+ "# Get the node attributes to check the current setting\n",
+ "print(\"The parallelization parameters of %s were: \" % mvau0.name)\n",
+ "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+ "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))\n",
+ "\n",
+ "# Set the new node attributes\n",
+ "mvau0_inst.set_nodeattr(\"PE\", 2)\n",
+ "mvau0_inst.set_nodeattr(\"SIMD\", 5)\n",
+ "\n",
+ "# Get the node attributes to check the updated setting\n",
+ "print(\"The parallelization parameters of %s are updated to: \" % mvau0.name)\n",
+ "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+ "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We save the model and view it. On expanding the first `MatrixVectorActivation` we can see the updated `PE` & `SIMD` parameters for that layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(\"cybsec_PE_SIMD_modified.onnx\")\n",
+ "showInNetron(\"cybsec_PE_SIMD_modified.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "From the total folding formula above, we have reduced the total folding of our layer from `600 x 64 = 38400` to `120 x 32 = 3840` (with `SIMD = 5`: `600/5 = 120`, and `PE = 2`: `64/2 = 32`), resulting in an estimated `10x` decrease in the execution latency of our layer. \n",
+ "This can be observed in the new estimated clock cycles."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cycles_dict_updated = model.analysis(exp_cycles_per_layer)\n",
+ "cycles_dict_updated"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(cycles_dict_updated.keys(), cycles_dict_updated.values(), color ='blue', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of clock cycles\")\n",
+ "plt.title(\"Clock cycles per layer with updated folding factors\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This, of course, has consequences for the resource usage of the network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res_dict_updated = model.analysis(res_estimation)\n",
+ "res_dict_updated"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs_updated = [res_dict_updated[key][\"LUT\"] for key in res_dict_updated.keys()] \n",
+ "\n",
+ "# Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(res_dict_updated.keys(), LUTs_updated, color ='green', width = 0.3)\n",
+ "plt.xlabel(\"Network Layers\")\n",
+ "plt.ylabel(\"LUT Utilisation\")\n",
+ "plt.title(\"No. of LUTs per layer with updated folding factors\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "From these numbers, we see that the first layer is no longer the bottleneck and that the entire network can now perform one inference in ~4096 clock cycles (when the pipeline is full), compared to the earlier configuration where it took ~38400 cycles.\n",
+ "\n",
+ "This decrease in the network's execution latency, however, comes at the cost of a 45% increase in LUT resource utilization for the first layer of the network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Important Note : StreamingDataWidthConverters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next to resources and performance, the folding factors (or parallelization parameters) also influence other properties of the generated design. Since results are computed in parallel, the data fed into a layer needs to be packed in a specific format to provide the correct data at the correct time for the internal parallelism. Likewise, the data that comes out of a layer will be in a format that depends on the internal parallelism."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To analyze the influence of the folding factors on the data streams between layers, we will first load the original model (with `PE=SIMD=1`) and then the updated model, so that we can compare the two."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n",
+ "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD.onnx\")\n",
+ "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the next step we extract the information from all layers. For MVAUs the input shape is (1, MW/SIMD, SIMD) and the output shape is (1, MH/PE, PE).\n",
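+ "\n",
+ "As a worked example, using the first-layer dimensions from above: with `MW = 600`, `SIMD = 5`, `MH = 64` and `PE = 2`, we expect a folded input shape of `(1, 120, 5)` and a folded output shape of `(1, 32, 2)`."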
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Original model\n",
+ "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "print(\"In the original model (pe=simd=1): \")\n",
+ "for mvau in list_of_mvaus:\n",
+ "    mvau_inst = getCustomOp(mvau)\n",
+ "    print(\"Layer: \" + mvau.name)\n",
+ "    print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+ "    print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Updated model\n",
+ "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "print(\"In the updated model: \")\n",
+ "for mvau in list_of_mvaus:\n",
+ "    mvau_inst = getCustomOp(mvau)\n",
+ "    print(\"Layer: \" + mvau.name)\n",
+ "    print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+ "    print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can see that the input and output shapes for MatrixVectorActivation_0 have changed after we updated the folding factors. These changes have a direct influence on the input/output stream widths. Let's have a closer look at the formulas used to calculate the stream widths of an MVAU."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(mvau_inst.get_instream_width)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(mvau_inst.get_outstream_width)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The input stream width can be calculated by multiplying the input bit width by SIMD, and the output stream width can be calculated by multiplying the output bit width by PE."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To connect two layers with each other for the final design, the input stream width of a node needs to match the output stream width of the preceding node. If that is not the case, FINN inserts DataWidthConverters (DWCs) to resolve the mismatch. Let's have a look at the input/output stream widths of the layers before updating the parallelization parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Original model\n",
+ "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "print(\"In the original model (pe=simd=1): \")\n",
+ "for mvau in list_of_mvaus:\n",
+ "    mvau_inst = getCustomOp(mvau)\n",
+ "    print(\"Layer: \" + mvau.name)\n",
+ "    print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+ "    print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the original model the output stream width of one layer matches the input stream width of the following layer. So there would be no DWC required when generating the final design."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the updated model, the situation is different. Let's have a look at how the stream widths have changed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Updated model\n",
+ "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "print(\"In the updated model: \")\n",
+ "for mvau in list_of_mvaus:\n",
+ "    mvau_inst = getCustomOp(mvau)\n",
+ "    print(\"Layer: \" + mvau.name)\n",
+ "    print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+ "    print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As we can see, the output stream width of MatrixVectorActivation_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed at `2`. The FINN compiler would therefore insert a DWC between these nodes; we can manually invoke this behavior by calling the `InsertDWC` transformation on our model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n",
+ "from qonnx.transformation.general import GiveUniqueNodeNames\n",
+ "\n",
+ "model_updated = model_updated.transform(InsertDWC())\n",
+ "model_updated = model_updated.transform(GiveUniqueNodeNames())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_updated.save(\"cybsec_DWC.onnx\")\n",
+ "showInNetron(\"cybsec_DWC.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can observe in the model that a DWC was inserted between the first two layers.\n",
+ "Since the DWC will also be a hardware block in our final FINN design, it has a latency and resources associated with it. Let's have a final look at our resource estimates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n",
+ "res_dict_dwc = model_dwc.analysis(res_estimation)\n",
+ "res_dict_dwc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since we now have one additional layer, we shorten the layer names for the plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "layers = res_dict_dwc.keys()\n",
+ "# replace names of layers with abbreviations\n",
+ "layers = [n.replace(\"MatrixVectorActivation_\", \"MVU\") for n in layers]\n",
+ "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs_dwc = [res_dict_dwc[key][\"LUT\"] for key in res_dict_dwc.keys()] \n",
+ "\n",
+ "# Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(layers, LUTs_dwc, color ='red', width = 0.3)\n",
+ "plt.xlabel(\"Network Layers\")\n",
+ "plt.ylabel(\"LUT Utilisation\")\n",
+ "plt.title(\"Estimated LUT values used for each network layer\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the case of our example network, the `StreamingDataWidthConverter_Batch` layer does not consume a large number of LUT resources, as shown in the graph. 
This might be different for larger models or if a higher number of DWCs needs to be inserted. Please be aware of this when setting the folding factors for your network."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb
new file mode 100644
index 0000000000..e748d85a1c
--- /dev/null
+++ b/notebooks/advanced/4_advanced_builder_settings.ipynb
@@ -0,0 +1,1642 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "8fcff912",
+ "metadata": {},
+ "source": [
+ "# Advanced Builder settings\n",
+ "\n",
+ "\"drawing\"\n",
+ "\n",
+ "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure on the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n",
+ "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a830e730",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we will have a more detailed look into the FINN builder tool and explore different options to customize your FINN design. We assume that you have already completed the [Cybersecurity notebooks](../end2end_example/cybersecurity) and that you have a basic understanding of how the FINN compiler works and how to use the FINN builder tool."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ec9a0db",
+ "metadata": {},
+ "source": [
+ "## Outline\n",
+ "---------------\n",
+ "\n",
+ "1. [Introduction to the CNV-w2a2 network](#intro_cnv)\n",
+ "2. [Recap default builder flow](#recap_builder)\n",
+ "3. [Build steps](#build_step)\n",
+ "    1. [How to make a custom build step](#custom_step)\n",
+ "4. [Folding configuration json](#folding_config)\n",
+ "5. [Additional builder arguments](#builder_arg)\n",
+ "    1. [Verification steps](#verify)\n",
+ "    2. [Other builder arguments](#other_args)\n",
+ "    3. [Examples for additional builder arguments & bitfile generation](#example_args)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5dbed63f",
+ "metadata": {},
+ "source": [
+ "## Introduction to the CNV-w2a2 network <a id=\"intro_cnv\"></a>\n",
+ "\n",
+ "The particular quantized neural network (QNN) we will be targeting in this notebook is referred to as CNV-w2a2, and it classifies 32x32 RGB images into one of ten CIFAR-10 classes. 
All weights and activations in this network are quantized to two bits, with the exception of the input (which is RGB with 8 bits per channel) and the final output (which consists of 32-bit numbers). It is similar to the convolutional neural network used in the [cnv_end2end_example](../end2end_example/bnn-pynq/cnv_end2end_example.ipynb) Jupyter notebook.\n",
+ "\n",
+ "\n",
+ "You'll have a chance to interactively examine the layers that make up the network in Netron. We start by setting the build directory to the directory this notebook is in and by importing helper functions that we use in the notebook to examine ONNX graphs and source code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce459f3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.util.visualization import showInNetron, showSrc\n",
+ "import os\n",
+ " \n",
+ "build_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7fc6444c",
+ "metadata": {},
+ "source": [
+ "In the next step, we will export the trained network directly from Brevitas to the QONNX format. QONNX is the intermediate representation (IR) that is used as the frontend to the FINN compiler. Please note that the internal representation of the network is still the FINN-ONNX format. [QONNX and FINN-ONNX](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-qonnx-and-finn-onnx) are extensions to the ONNX format to represent quantization, especially below 8 bits, in ONNX graphs. The main difference is that quantization in QONNX graphs is represented using dedicated quantization nodes ([more about QONNX](https://github.com/fastmachinelearning/qonnx)), while the quantization in FINN-ONNX is an annotation attached to the tensors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe262964",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from finn.util.test import get_test_model_trained\n",
+ "from brevitas.export import export_qonnx\n",
+ "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n",
+ "\n",
+ "cnv = get_test_model_trained(\"CNV\", 2, 2)\n",
+ "export_onnx_path = build_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n",
+ "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d24b632f",
+ "metadata": {},
+ "source": [
+ "After the export, we call a cleanup function on the model. This makes sure that, for example, all shapes in the network are inferred, constant folding is applied and all tensors and nodes have unique names. In the next step, we can visualize the graph using Netron. When scrolling through the graph, you can see the Quant nodes that indicate the quantization in the network. In the [first step](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L260) of the FINN builder flow, the network gets converted from the QONNX format to the FINN-ONNX format. That means these Quant nodes will not be present in the graph anymore; instead, the quantization will be attached as an annotation to the tensors.\n",
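+ "\n",
+ "If you want to try this conversion outside the builder flow, it is also available as a standalone transformation. A minimal sketch (the builder normally runs this for you in `step_qonnx_to_finn`; the output file name here is illustrative):\n",
+ "\n",
+ "```python\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n",
+ "\n",
+ "model = ModelWrapper(build_dir + \"/end2end_cnv_w2a2_export.onnx\")\n",
+ "model = model.transform(ConvertQONNXtoFINN())\n",
+ "model.save(build_dir + \"/end2end_cnv_w2a2_finn_onnx.onnx\")  # illustrative file name\n",
+ "```"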
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87f59da6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/end2end_cnv_w2a2_export.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c764ed76",
+ "metadata": {},
+ "source": [
+ "## Quick recap: how to set up the default builder flow for resource estimations <a id=\"recap_builder\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a26e5418",
+ "metadata": {},
+ "source": [
+ "As a quick recap, let's set up the builder like we did in the cybersecurity example to get the resource estimates for our example network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9007705a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Quick recap on how to set up the default builder flow for resource estimations\n",
+ "\n",
+ "import finn.builder.build_dataflow as build\n",
+ "import finn.builder.build_dataflow_config as build_cfg\n",
+ "import os\n",
+ "import shutil\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "estimates_output_dir = build_dir + \"/output_estimates_only\"\n",
+ "\n",
+ "# Delete previous run results if they exist\n",
+ "if os.path.exists(estimates_output_dir):\n",
+ "    shutil.rmtree(estimates_output_dir)\n",
+ "    print(\"Previous run results deleted!\")\n",
+ "\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ "    output_dir = estimates_output_dir,\n",
+ "    mvau_wwidth_max = 80,\n",
+ "    target_fps = 10000,\n",
+ "    synth_clk_period_ns = 10.0,\n",
+ "    fpga_part = \"xc7z020clg400-1\",\n",
+ "    steps = build_cfg.estimate_only_dataflow_steps,\n",
+ "    generate_outputs=[\n",
+ "        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ "    ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02e4c0f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fa0b9f5",
+ "metadata": {},
+ "source": [
+ "The output directory was created, and from the generated files we can extract information about our model and how it was processed by the FINN compiler. Let's focus on the intermediate models for now; you can find them in the folder \"intermediate_models\" inside the output directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05a941ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_estimates_only/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d746eff3",
+ "metadata": {},
+ "source": [
+ "After each FINN builder step, the graph is saved as a .onnx file. In the cell above we list the intermediate models in the order they were created (`ls -t -r`) to visualize the builder flow. As you can see, after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy-up and streamlining (`step_tidy_up` and `step_streamline`), and then the high-level nodes are converted to HLS layers (`step_convert_to_hls`). Next, a partition is created from all layers that were converted to HLS layers (`step_create_dataflow_partition`), and optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). 
Use the code below to investigate the network after each step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72de8d4c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_to_investigate = \"step_qonnx_to_finn.onnx\"\n",
+ "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/\"+model_to_investigate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bccebd0d",
+ "metadata": {},
+ "source": [
+ "Analyzing these .onnx files can help us identify points in the flow where we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HLS layers, we can see that some layers were not converted; you can check this by clicking on the different nodes. HLS layers have the module `finn.custom_op.fpgadataflow`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d86463a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hls.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2719cc09",
+ "metadata": {},
+ "source": [
+ "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HLS layers. FINN currently converts only integer operations into HLS layers; a node is converted only when its input, output and weights are quantized to integers."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff7fa549",
+ "metadata": {},
+ "source": [
+ "
\n", + "Important notice: We are working on supporting additional data types and this limitation might disappear in the near future.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "6e6d942e", + "metadata": {}, + "source": [ + "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it can not be derived from the preceeding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HLS layer, the input is assumed to be floating point." + ] + }, + { + "cell_type": "markdown", + "id": "8b8994e6", + "metadata": {}, + "source": [ + "The solution to the problem depends on the actual data input.\n", + "1. The data set is quantized and `global_in` is an integer: We set the data type of the tensor `global_in` before passing the model to the FINN compiler using [helper functions of ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#helper-functions-for-tensors).\n", + "2. The data set is not quantized: we can either execute the first layer in software (e.g. as part of the Python driver) or we can add a preprocessing step into the graph." + ] + }, + { + "cell_type": "markdown", + "id": "7504dce7", + "metadata": {}, + "source": [ + "Even though in the example of the CNVw2a2, the inputs are 32x32 RGB images, so the input values are 8 bit (UINT8) \"quantized\", the input to the exported model is floating point. For training in Brevitas, these values were normalized between 0 and 1.0 and so the exported model expects floating point values as input. \n", + "This means we are in scenario 2. In the next section we will develop a custom step for the FINN builder flow to add preprocessing to our network.\n", + "\n", + "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HLS layers." + ] + }, + { + "cell_type": "markdown", + "id": "f9c2696b", + "metadata": {}, + "source": [ + "We have two nodes at the end of the graph that we were not able to convert: a floating poing scalar multiplication and addition. These operations are \"left-over\" from streamlining and cannot be merged into a succeeding thresholding operation. \n", + "\n", + "Our example is a network for image classification, so the output is a vector of 10 values that give a predicition score for each of the classes in the CIFAR-10 data set. If we are only interested in the Top-1 result of the classification, we can add a post-processing step which inserts a TopK node in the graph. \n", + "\n", + "Since the last two layers are scalar operations, they have the same influence on all predicition scores in the output vector and we can safely merge them into the TopK node. " + ] + }, + { + "cell_type": "markdown", + "id": "4fc8fbf5", + "metadata": {}, + "source": [ + "These pre-processing and post-processing steps are network dependent and we will need to write **custom steps** that can then be executed using the FINN builder tool.\n", + "\n", + "In the next section we will first look into how a standard build step inside FINN looks like and then we will write our own custom steps for pre- and post-processing and add them to the builder configuration." + ] + }, + { + "cell_type": "markdown", + "id": "7e561a91", + "metadata": {}, + "source": [ + "## Build steps " + ] + }, + { + "cell_type": "markdown", + "id": "fb18b21d", + "metadata": {}, + "source": [ + "The following steps are executed when using the `estimates_only`-flow." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3fe1186",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"\\n\".join(build_cfg.estimate_only_dataflow_steps))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd3ef987",
+ "metadata": {},
+ "source": [
+ "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "313fac18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "print(build_dataflow_steps.step_tidy_up.__doc__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "029da0da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "showSrc(build_dataflow_steps.step_tidy_up)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2809f6a7",
+ "metadata": {},
+ "source": [
+ "Each step gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9c2c97f",
+ "metadata": {},
+ "source": [
+ "### How to make a custom build step <a id=\"custom_step\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "537a44e7",
+ "metadata": {},
+ "source": [
+ "When writing our own custom steps, we use the same pattern. The code below shows the pre-processing step for the example network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9d43cc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.util.pytorch import ToTensor\n",
+ "from qonnx.transformation.merge_onnx_models import MergeONNXModels\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from qonnx.core.datatype import DataType\n",
+ "\n",
+ "def custom_step_add_pre_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n",
+ "    ishape = model.get_tensor_shape(model.graph.input[0].name)\n",
+ "    # preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n",
+ "    preproc = ToTensor()\n",
+ "    export_qonnx(preproc, torch.randn(ishape), \"preproc.onnx\", opset_version=11)\n",
+ "    preproc_model = ModelWrapper(\"preproc.onnx\")\n",
+ "    # set input finn datatype to UINT8\n",
+ "    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType[\"UINT8\"])\n",
+ "    # merge pre-processing onnx model with cnv model (passed as input argument)\n",
+ "    model = model.transform(MergeONNXModels(preproc_model))\n",
+ "    return model\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a6798aa",
+ "metadata": {},
+ "source": [
+ "In the next step we can modify the builder configuration to execute a custom sequence of builder steps, including the newly implemented pre-processing custom step.\n",
+ "\n",
+ "For that, we create a list `build_steps` which contains, in addition to the standard steps from the `estimate_only` flow, the new custom step for the pre-processing. This list is then passed to the build configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f00b465",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Builder flow with custom step for pre-processing\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_pre_proc\"\n",
+ "\n",
+ "# Delete previous run results if they exist\n",
+ "if os.path.exists(output_dir):\n",
+ "    shutil.rmtree(output_dir)\n",
+ "    print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ "    custom_step_add_pre_proc,\n",
+ "    \"step_qonnx_to_finn\",\n",
+ "    \"step_tidy_up\",\n",
+ "    \"step_streamline\",\n",
+ "    \"step_convert_to_hls\",\n",
+ "    \"step_create_dataflow_partition\",\n",
+ "    \"step_target_fps_parallelization\",\n",
+ "    \"step_apply_folding_config\",\n",
+ "    \"step_minimize_bit_width\",\n",
+ "    \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ "    output_dir = output_dir,\n",
+ "    mvau_wwidth_max = 80,\n",
+ "    target_fps = 10000,\n",
+ "    synth_clk_period_ns = 10.0,\n",
+ "    fpga_part = \"xc7z020clg400-1\",\n",
+ "    steps = build_steps,\n",
+ "    generate_outputs=[\n",
+ "        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ "    ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3a2bcea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "51b7dbd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_pre_proc/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4690049f",
+ "metadata": {},
+ "source": [
+ "An intermediate .onnx file was automatically created after the execution of the custom step; let's have a look at the graph."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87e5651e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_pre_proc/intermediate_models/custom_step_add_pre_proc.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90c6bef9",
+ "metadata": {},
+ "source": [
+ "The graph is in QONNX format, and a division by 255 is inserted at the beginning. We can now use the CIFAR-10 images directly as input to the graph, and the new `global_in` tensor is UINT8.\n",
+ "\n",
+ "You can already have a look at how the intermediate models have changed by modifying the code in the cell above. Before we go into more detail, we will add another custom step to insert the post-processing. In this case, this means inserting a TopK node.\n",
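+ "\n",
+ "As a quick plain-numpy illustration of what Top-1 selection computes (illustrative only, not FINN code):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "scores = np.array([0.1, 2.3, -0.5, 1.7])  # hypothetical prediction scores\n",
+ "top1_class = np.argmax(scores)            # index of the highest score, here 1\n",
+ "```"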
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c6f1bd0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from qonnx.transformation.insert_topk import InsertTopK\n",
+ "\n",
+ "def custom_step_add_post_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n",
+ "    model = model.transform(InsertTopK(k=1))\n",
+ "    return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57adbb44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Builder flow with custom step for pre-processing and post-processing\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_pre_and_post_proc\"\n",
+ "\n",
+ "# Delete previous run results if they exist\n",
+ "if os.path.exists(output_dir):\n",
+ "    shutil.rmtree(output_dir)\n",
+ "    print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ "    custom_step_add_pre_proc,\n",
+ "    custom_step_add_post_proc,\n",
+ "    \"step_qonnx_to_finn\",\n",
+ "    \"step_tidy_up\",\n",
+ "    \"step_streamline\",\n",
+ "    \"step_convert_to_hls\",\n",
+ "    \"step_create_dataflow_partition\",\n",
+ "    \"step_target_fps_parallelization\",\n",
+ "    \"step_apply_folding_config\",\n",
+ "    \"step_minimize_bit_width\",\n",
+ "    \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ "    output_dir = output_dir,\n",
+ "    mvau_wwidth_max = 80,\n",
+ "    target_fps = 10000,\n",
+ "    synth_clk_period_ns = 10.0,\n",
+ "    fpga_part = \"xc7z020clg400-1\",\n",
+ "    steps = build_steps,\n",
+ "    generate_outputs=[\n",
+ "        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ "    ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0598b81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95230896",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_pre_and_post_proc/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a0263b1",
+ "metadata": {},
+ "source": [
+ "You can use the code in the cell below to investigate the generated intermediate models. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "44127417",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_to_investigate = \"custom_step_add_post_proc.onnx\"\n",
+ "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/\"+model_to_investigate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5cc97505",
+ "metadata": {},
+ "source": [
+ "Let's have a look at the model after the conversion to HLS layers to verify that all layers are now correctly converted."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63131e3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hls.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fd0af6b",
+ "metadata": {},
+ "source": [
+ "The model now contains a `Thresholding` layer at the beginning and a `LabelSelect_Batch` layer at the end. Please note that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator.\n",
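+ "\n",
+ "On the host side this conversion is a simple transpose; a minimal numpy sketch (the array names and shapes are illustrative):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "x_nchw = np.zeros((1, 3, 32, 32), dtype=np.uint8)  # hypothetical CIFAR-10 batch in NCHW layout\n",
+ "x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))        # reorder to NHWC before streaming\n",
+ "print(x_nhwc.shape)  # (1, 32, 32, 3)\n",
+ "```"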
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ffbadd1",
+ "metadata": {},
+ "source": [
+ "## Folding configuration json <a id=\"folding_config\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c164040f",
+ "metadata": {},
+ "source": [
+ "The FINN compiler allows the user to implement a network in a streaming dataflow architecture; this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting its parallelism and resource type. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](3_folding.ipynb).\n",
+ "\n",
+ "In this section, we will look into the interface over which we can influence the customization of each layer using the FINN builder tool: a .json file containing the folding configuration."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1299b86d",
+ "metadata": {},
+ "source": [
+ "Depending on the invoked step, the FINN compiler can produce or consume a .json file containing the folding configuration for each layer. In the cell below, we will have a look at the automatically generated .json file, which is produced by `step_target_fps_parallelization`. We then use this as a starting point to manipulate the folding configuration and feed it back into the builder tool."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f75f5634",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n",
+ "    folding_config = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(folding_config, indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8de787a7",
+ "metadata": {},
+ "source": [
+ "As you can see from the printed cell above, the keys in the .json file are the node names of the layers in our network. For each of the layers, some node attributes are listed:\n",
+ "* `PE` and `SIMD` are the folding parameters that determine the parallelism of each layer; depending on the layer, they can be set to different values. For details refer to [this table](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer).\n",
+ "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to `external`, which allows for the use of external weights.\n",
+ "* `ram_style`: when selecting `decoupled` mode, the FINN compiler allows us to choose which memory resource will be used for the layer. The argument `ram_style` is set to the selected memory type:\n",
+ "  * `auto`: Vivado will make the decision if the implementation is using LUTRAM or BRAM\n",
+ "  * `distributed`: LUTRAM will be used\n",
+ "  * `block`: BRAM will be used\n",
+ "  * `ultra`: URAM will be used, if available on the selected board\n",
+ "\n",
+ "* `resType`: This is a node attribute for the MVAU layer and can be set to `lut` or `dsp`. 
Please note that selecting `dsp` will not enable the optimized RTL variant of the MVAU but rather generate HLS code utilizing DSPs; this is not optimal yet, but it gives an additional parameter for design space exploration.\n",
+ "* `runtime_writeable_weights`: FINN offers the option to implement the weights as \"runtime writable\"; this means you can write the weight values from the driver via an AXI-Lite interface."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd1519fe",
+ "metadata": {},
+ "source": [
+ "In the following part of the tutorial, we will use the auto-generated .json file as a starting point to create two new .json files that explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n",
+ "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7f42774",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_pre_and_post_proc/report/estimate_layer_resources.json\", 'r') as json_file:\n",
+ "    json_object = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(json_object[\"total\"], indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0be3b0e1",
+ "metadata": {},
+ "source": [
+ "The FINN compiler estimates the network to use ~500 BRAM blocks and ~100k LUTs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4d177dc",
+ "metadata": {},
+ "source": [
+ "We will use the `auto_folding_config.json` and create two folding configurations from that file:\n",
+ "* All `ram_style` attributes set to `distributed`\n",
+ "* All `ram_style` attributes set to `block`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "112af6fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n",
+ "    folding_config = json.load(json_file)\n",
+ "\n",
+ "# Set all ram_style to LUT RAM\n",
+ "for key in folding_config:\n",
+ "    if \"ram_style\" in folding_config[key]:\n",
+ "        folding_config[key][\"ram_style\"] = \"distributed\" \n",
+ "# Save as .json \n",
+ "with open(\"folding_config_all_lutram.json\", \"w\") as jsonFile:\n",
+ "    json.dump(folding_config, jsonFile)\n",
+ "    \n",
+ "# Set all ram_style to BRAM\n",
+ "for key in folding_config:\n",
+ "    if \"ram_style\" in folding_config[key]:\n",
+ "        folding_config[key][\"ram_style\"] = \"block\" \n",
+ "# Save as .json \n",
+ "with open(\"folding_config_all_bram.json\", \"w\") as jsonFile:\n",
+ "    json.dump(folding_config, jsonFile)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e64a499",
+ "metadata": {},
+ "source": [
+ "After generating these files, we will invoke the builder flow. To enable the FINN builder to take the generated folding configuration as input, we will need to set the additional builder argument `folding_config_file`, and we will change the `build_steps` to not run `step_target_fps_parallelization`. The build step does not necessarily need to be excluded, but since we pass a separate folding configuration, the output from that step would be overwritten anyway, so we skip it for faster execution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cdd9f706",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with custom folding configuration\n",
+ "## folding_config_file = \"folding_config_all_lutram.json\"\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_all_lutram\"\n",
+ "\n",
+ "# Delete previous run results if they exist\n",
+ "if os.path.exists(output_dir):\n",
+ "    shutil.rmtree(output_dir)\n",
+ "    print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ "    custom_step_add_pre_proc,\n",
+ "    custom_step_add_post_proc,\n",
+ "    \"step_qonnx_to_finn\",\n",
+ "    \"step_tidy_up\",\n",
+ "    \"step_streamline\",\n",
+ "    \"step_convert_to_hls\",\n",
+ "    \"step_create_dataflow_partition\",\n",
+ "    \"step_apply_folding_config\",\n",
+ "    \"step_minimize_bit_width\",\n",
+ "    \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ "    output_dir = output_dir,\n",
+ "    mvau_wwidth_max = 80,\n",
+ "    synth_clk_period_ns = 10.0,\n",
+ "    fpga_part = \"xc7z020clg400-1\",\n",
+ "    steps = build_steps,\n",
+ "    folding_config_file = \"folding_config_all_lutram.json\",\n",
+ "    generate_outputs=[\n",
+ "        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ "    ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99b647c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e705767d",
+ "metadata": {},
+ "source": [
+ "We can now have a look at the produced model; when clicking on the individual nodes, you can see that all layers have the node attribute `ram_style` set to `distributed`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc680178",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_all_lutram/intermediate_models/step_generate_estimate_reports.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "695ecfb1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_all_lutram/report/estimate_layer_resources.json\", 'r') as json_file:\n",
+ "    json_object = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(json_object[\"total\"], indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55208c70",
+ "metadata": {},
+ "source": [
+ "The estimation report shows that BRAM utilization is down to zero and the LUT count went up to around 150k."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11b8430a",
+ "metadata": {},
+ "source": [
+ "Let's do the same with the folding configuration which sets all memory resources to use BRAM."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e8aaaa", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## folding_config_file = \"folding_config_all_bram.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_bram\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hls\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " folding_config_file = \"folding_config_all_bram.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cdc1aa0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd0388fd", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e60a3efb", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_all_bram/report/estimate_layer_resources.json\", 'r') as json_file:\n", + " json_object = json.load(json_file)\n", + "\n", + "print(json.dumps(json_object[\"total\"], indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "97f87780", + "metadata": {}, + "source": [ + "The initial implementation already had a high utilization of BRAM, but the estimations went now up to 522 BRAMs while the LUT count went down to ~99k." + ] + }, + { + "cell_type": "markdown", + "id": "e65a8ded", + "metadata": {}, + "source": [ + "You can use this example as a starting point to manipulate the folding configuration yourself. Instead of using the above code, you can also manually open one of the example .json files and set the values differently. Please be aware that the node attributes can not be set to arbitrary values. Especially the folding factors need to fulfil [certain constraints](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer). The other settings for node attributes, can be best looked up in the individual custom operator classes: [e.g. for MVAU](https://github.com/Xilinx/finn/blob/dev/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py#L64)" + ] + }, + { + "cell_type": "markdown", + "id": "4a675834", + "metadata": {}, + "source": [ + "## Additional builder arguments " + ] + }, + { + "cell_type": "markdown", + "id": "f7012b9a", + "metadata": {}, + "source": [ + "In this section, we will have a peak into additional builder arguments the FINN compiler exposes. 
We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." + ] + }, + { + "cell_type": "markdown", + "id": "467d8829", + "metadata": {}, + "source": [ + "We start by enabling the verification flow in the builder. The FINN compiler applies multiple transformations to the model before it gets turned into hardware, so we need to make sure that the functional behavior of the network does not change." + ] + }, + { + "cell_type": "markdown", + "id": "e0c167f4", + "metadata": {}, + "source": [ + "### Verification steps " + ] + }, + { + "cell_type": "markdown", + "id": "308d52ba", + "metadata": {}, + "source": [ + "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fe7318e", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "showSrc(build_dataflow_steps.step_tidy_up)" + ] + }, + { + "cell_type": "markdown", + "id": "2bbb84fb", + "metadata": {}, + "source": [ + "Some of the default build steps have automatic verification enabled, when the corresponding verification step is set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce1aa025", + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(build_cfg.VerificationStepType)" + ] + }, + { + "cell_type": "markdown", + "id": "da1a2b88", + "metadata": {}, + "source": [ + "In the cells below, we will use an example input from the CIFAR-10 data set and use the forward pass in Brevitas to generate a reference output. We save the input as `input.npy` and the reference output as `expected_output.npy`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e157d03c", + "metadata": {}, + "outputs": [], + "source": [ + "# Get golden io pair from Brevitas and save as .npy files\n", + "from finn.util.test import get_trained_network_and_ishape, get_example_input, get_topk\n", + "import numpy as np\n", + "\n", + "\n", + "(brevitas_model, ishape) = get_trained_network_and_ishape(\"cnv\", 2, 2)\n", + "input_tensor_npy = get_example_input(\"cnv\")\n", + "input_tensor_torch = torch.from_numpy(input_tensor_npy).float()\n", + "input_tensor_torch = ToTensor().forward(input_tensor_torch).detach()\n", + "output_tensor_npy = brevitas_model.forward(input_tensor_torch).detach().numpy()\n", + "output_tensor_npy = get_topk(output_tensor_npy, k=1)\n", + "\n", + "np.save(\"input.npy\", input_tensor_npy)\n", + "np.save(\"expected_output.npy\", output_tensor_npy)" + ] + }, + { + "cell_type": "markdown", + "id": "d03450e7", + "metadata": {}, + "source": [ + "In the next step we set up the builder flow again, this time we will set the build argument `verify_steps` and pass a list of verification steps." 
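Before launching the build, we can quickly sanity-check the golden pair we just saved. This is an optional minimal sketch, not something the builder requires:

```python
import numpy as np

# Optional sanity check (not required by the builder): reload the golden pair
# saved above and confirm the arrays come back with sensible shapes.
inp = np.load("input.npy")
exp = np.load("expected_output.npy")
print("input shape:", inp.shape, "expected output shape:", exp.shape)
```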
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd3032b", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with additional builder arguments enabled\n", + "## verification steps\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_with_verification\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hls\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ],\n", + " verify_steps=[\n", + " build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON,\n", + " build_cfg.VerificationStepType.TIDY_UP_PYTHON,\n", + " build_cfg.VerificationStepType.STREAMLINED_PYTHON,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1d05b985", + "metadata": {}, + "source": [ + "When execution the code below, the verification will be invoked in the background. After the execution we can check if the verification was successful by investigating the output directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3a46e76", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "ca1d571d", + "metadata": {}, + "source": [ + "The output directory has now an additional directory called `verification_output`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca74d537", + "metadata": {}, + "outputs": [], + "source": [ + "!ls {build_dir}/output_with_verification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908ecda4", + "metadata": {}, + "outputs": [], + "source": [ + "!ls {build_dir}/output_with_verification/verification_output" + ] + }, + { + "cell_type": "markdown", + "id": "bcbc6f49", + "metadata": {}, + "source": [ + "The directory contains three .npy files. These files are the saved output files from the different verification steps. The suffix indicates if the array matches with the expected output. In our case, the suffix is for all verification steps `_SUCCESS`. Since the outputs are saved as .npy, we can open and investigate the files simply in Python." 
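The file names alone already summarize the result of each step. The following small sketch (reusing the notebook's `build_dir`; the parsing simply splits off the status suffix) prints one line per verification step:

```python
import os

# Sketch: summarize the verification results from the file names; the suffix
# (_SUCCESS or otherwise) encodes whether each step matched the reference.
verif_dir = build_dir + "/output_with_verification/verification_output"
for fname in sorted(os.listdir(verif_dir)):
    step, _, status = fname.rpartition("_")
    print(step, "->", status.replace(".npy", ""))
```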
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a1b6ca9", + "metadata": {}, + "outputs": [], + "source": [ + "verify_initial_python = np.load(build_dir + \"/output_with_verification/verification_output/verify_initial_python_0_SUCCESS.npy\")\n", + "print(\"The output of the verification step after the step_tidy_up is: \" + str(verify_initial_python))" + ] + }, + { + "cell_type": "markdown", + "id": "6558e19e", + "metadata": {}, + "source": [ + "If the generated output does not match the expected output, these files can be used for debugging." + ] + }, + { + "cell_type": "markdown", + "id": "4609f94d", + "metadata": {}, + "source": [ + "### Other builder arguments " + ] + }, + { + "cell_type": "markdown", + "id": "37b6853d", + "metadata": {}, + "source": [ + "Next to the enablement of the verification flows, the FINN builder has numerous additional builder arguments to further customize your network. \n", + "Let's have a look at the options for the arguments. We want to only filter out the FINN specific arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9f6aa29", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter out methods\n", + "builder_args = [m for m in dir(build_cfg.DataflowBuildConfig) if not m.startswith('_')]\n", + "print(\"\\n\".join(builder_args))" + ] + }, + { + "cell_type": "markdown", + "id": "b12ab370", + "metadata": {}, + "source": [ + "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", + "\n", + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. target_fps, fpga_part and folding_config_file. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + ] + }, + { + "cell_type": "markdown", + "id": "9aba0493", + "metadata": {}, + "source": [ + "So far, in this notebook, we only looked at configurations up to the generation of estimate reports, a lot of these builder arguments actually become relevant at a later stage in the FINN flow.\n", + "\n", + "Let's have a look at the default build dataflow steps for the complete FINN flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec39b9f2", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\".join(build_cfg.default_build_dataflow_steps))" + ] + }, + { + "cell_type": "markdown", + "id": "b9bc5715", + "metadata": {}, + "source": [ + "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hls_codegen` and `step_hls_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). 
At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", + "The FINN builder also provides automatic system integration for Zynq and Alveo devices, this can be invoked by running `step_synthesize_bitfile`, `step_make_pynq_driver` and `step_deployment_package`." + ] + }, + { + "cell_type": "markdown", + "id": "76df000f", + "metadata": {}, + "source": [ + "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caf49f03", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "print(build_dataflow_steps.step_hls_codegen.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c84a9fbc", + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(build_dataflow_steps.step_hls_codegen)" + ] + }, + { + "cell_type": "markdown", + "id": "c249f141", + "metadata": {}, + "source": [ + "This concludes the advanced builder settings tutorial. Below you can find code that can help you investigating more of the builder arguments and invoking the whole flow to generate a bitfile." + ] + }, + { + "cell_type": "markdown", + "id": "3b98eb65", + "metadata": {}, + "source": [ + "### Examples for additional builder arguments & bitfile generation " + ] + }, + { + "cell_type": "markdown", + "id": "0dbdab42", + "metadata": {}, + "source": [ + "#### Standalone Thresholds" + ] + }, + { + "cell_type": "markdown", + "id": "e21ff36f", + "metadata": {}, + "source": [ + "In FINN, convolutions are expressed with three components:\n", + "* An Im2Col operation\n", + "* A matrix multiplication\n", + "* A MultiThreshold operation\n", + "\n", + "When converting these nodes into HLS layers, by default the MatMul and the MultiThreshold gets converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", + "\n", + "If you would like to enable this feature, you can set the build argument `standalone_thresholds` to `True`. In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." 
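Besides viewing the result in Netron, you can also check the effect programmatically. The sketch below is a hypothetical helper (it reuses qonnx's `ModelWrapper`, which the FINN flow is built on) that counts the op types of a generated model; once the (uncommented) build below has run with `standalone_thresholds = True`, separate standalone threshold nodes should show up next to the MVAU layers:

```python
from qonnx.core.modelwrapper import ModelWrapper

# Hypothetical helper: count node op_types in a generated intermediate model.
# With standalone_thresholds=True, separate threshold nodes should appear
# alongside the MVAU layers.
def count_op_types(onnx_path):
    counts = {}
    for node in ModelWrapper(onnx_path).graph.node:
        counts[node.op_type] = counts.get(node.op_type, 0) + 1
    return counts

# Example (path from the run below):
# count_op_types(build_dir + "/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx")
```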
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2619ebde", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with additional builder arguments enabled\n", + "## standalone_thresholds = True\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_standalone_thresholds\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hls\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " standalone_thresholds = True,\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2e9bc42", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32ae296e", + "metadata": {}, + "outputs": [], + "source": [ + "#showInNetron(build_dir+\"/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "074d8253", + "metadata": {}, + "source": [ + "#### RTL Convolutional Input Generator" + ] + }, + { + "cell_type": "markdown", + "id": "b85e5ac7", + "metadata": {}, + "source": [ + "Recently, we have worked on the *Operator Hardening* in the FINN compiler. This means that we implement core building blocks in RTL instead of using HLS.\n", + "One of these components is already available in the FINN compiler, you can enable the RTL implementation of the ConvolutionInputGenerator (aka Sliding Window Generator) by setting the build argument `force_rtl_conv_inp_gen` to `True`.\n", + "In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." + ] + }, + { + "cell_type": "markdown", + "id": "2a90b63f", + "metadata": {}, + "source": [ + "
\n", + "Important notice: We are actively working on the integration of RTL components in the FINN flow, the enablement like shown below might change in the future.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0c4974", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with additional builder arguments enabled\n", + "## force_rtl_conv_inp_gen = True\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_rtl_swg\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hls\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " force_rtl_conv_inp_gen = True,\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19fe4d85", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c1f1ce9", + "metadata": {}, + "outputs": [], + "source": [ + "#showInNetron(build_dir+\"/output_rtl_swg/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "601eb5f8", + "metadata": {}, + "source": [ + "#### Run the whole flow" + ] + }, + { + "cell_type": "markdown", + "id": "42aa929b", + "metadata": {}, + "source": [ + "The code below can be used to invoke the full builder flow and obtain more output products, be aware that this runs synthesis and bitfile generation and it might take over an hour. Please note that you need to uncomment the code first." + ] + }, + { + "cell_type": "markdown", + "id": "ffa2a352", + "metadata": {}, + "source": [ + "For an optimized design, we download the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples). And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." 
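To decide what to request in `generate_outputs` for the full build below, it can help to enumerate the available output products first. A minimal sketch, using the same enum the builder configuration draws from (`build_cfg` is already imported in this notebook):

```python
import finn.builder.build_dataflow_config as build_cfg

# Sketch: list the output products the builder can generate; these are the
# same members passed to generate_outputs in the build configuration below.
for output_type in build_cfg.DataflowOutputType:
    print(output_type.name)
```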
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765e5ee7", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/Xilinx/finn-examples/main/build/bnn-pynq/folding_config/cnv-w2a2_folding_config.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4efd46f4", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "import os\n", + "import shutil\n", + "\n", + "## Build flow with hardware build\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_bitfile\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hls\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + " \"step_hls_codegen\",\n", + " \"step_hls_ipgen\",\n", + " \"step_set_fifo_depths\",\n", + " \"step_create_stitched_ip\",\n", + " \"step_measure_rtlsim_performance\",\n", + " \"step_out_of_context_synthesis\",\n", + " \"step_synthesize_bitfile\",\n", + " \"step_make_pynq_driver\",\n", + " \"step_deployment_package\",\n", + "]\n", + "\n", + "cfg_build = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", + " board = \"Pynq-Z1\",\n", + " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " build_cfg.DataflowOutputType.STITCHED_IP,\n", + " build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,\n", + " build_cfg.DataflowOutputType.OOC_SYNTH,\n", + " build_cfg.DataflowOutputType.BITFILE,\n", + " build_cfg.DataflowOutputType.PYNQ_DRIVER,\n", + " build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7ff6c19", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_build);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx new file mode 100644 index 0000000000..b450cc9e43 Binary files /dev/null and b/notebooks/advanced/cybsec_PE_SIMD.onnx differ diff --git a/notebooks/advanced/finn-dataflow.png b/notebooks/advanced/finn-dataflow.png new file mode 100755 index 0000000000..ebe98d0fbd Binary files /dev/null and b/notebooks/advanced/finn-dataflow.png differ diff 
--git a/notebooks/advanced/finn-folding-mvau.png b/notebooks/advanced/finn-folding-mvau.png new file mode 100755 index 0000000000..bbba00182c Binary files /dev/null and b/notebooks/advanced/finn-folding-mvau.png differ diff --git a/notebooks/advanced/finn-folding.png b/notebooks/advanced/finn-folding.png new file mode 100755 index 0000000000..019b4aa1e7 Binary files /dev/null and b/notebooks/advanced/finn-folding.png differ diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb index 514efd1693..35a83ea97b 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -24,7 +24,7 @@ "source": [ "### How to create a simple ONNX model\n", "\n", - "To explain how to create an ONNX model a simple example with mathematical operations is used. All nodes are from the [standard operations library of ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md).\n", + "To explain how to create an ONNX model a simple example with mathematical operations is used. All nodes are from the [standard operations library of ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md).\n", "\n", "First ONNX is imported, then the helper function can be used to make a node." ] @@ -36,6 +36,7 @@ "outputs": [], "source": [ "import onnx\n", + "from qonnx.util.basic import qonnx_make_model\n", "\n", "Add1_node = onnx.helper.make_node(\n", " 'Add',\n", @@ -158,7 +159,7 @@ "metadata": {}, "outputs": [], "source": [ - "onnx_model = onnx.helper.make_model(graph, producer_name=\"simple-model\")\n", + "onnx_model = qonnx_make_model(graph, producer_name=\"simple-model\")\n", "onnx.save(onnx_model, '/tmp/simple_model.onnx')" ] }, @@ -304,7 +305,7 @@ "source": [ "### How to manipulate an ONNX model\n", "\n", - "In the model there are two successive adder nodes. An adder node in ONNX can only add two inputs, but there is also the [**sum**](https://github.com/onnx/onnx/blob/master/docs/Operators.md#Sum) node, which can process more than two inputs. So it would be a reasonable change of the graph to combine the two successive adder nodes to one sum node." + "In the model there are two successive adder nodes. An adder node in ONNX can only add two inputs, but there is also the [**sum**](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sum) node, which can process more than two inputs. So it would be a reasonable change of the graph to combine the two successive adder nodes into one sum node."
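For illustration, here is a minimal sketch of such a Sum node, built with the same `onnx.helper.make_node` API the notebook uses (the tensor names are made up for this example):

```python
import onnx

# Illustrative sketch: ONNX's Sum op accepts more than two inputs, so a single
# Sum node can stand in for two chained Add nodes. Tensor names are made up.
Sum_node = onnx.helper.make_node(
    'Sum',
    inputs=['in1', 'in2', 'in3'],
    outputs=['sum_out'],
)
```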
] }, { @@ -550,7 +551,7 @@ "metadata": {}, "outputs": [], "source": [ - "onnx_model1 = onnx.helper.make_model(graph, producer_name=\"simple-model1\")\n", + "onnx_model1 = qonnx_make_model(graph, producer_name=\"simple-model1\")\n", "onnx.save(onnx_model1, '/tmp/simple_model1.onnx')" ] }, @@ -598,7 +599,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb similarity index 65% rename from notebooks/basics/1_brevitas_network_import.ipynb rename to notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb index 5fb29754dc..f15f716e7f 100644 --- a/notebooks/basics/1_brevitas_network_import.ipynb +++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb @@ -4,13 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Importing Brevitas networks into FINN\n", + "# Importing Brevitas networks into FINN with the QONNX interchange format\n", + "\n", + "**Note: Previously it was possible to directly export the FINN-ONNX interchange format from Brevitas to pass to the FINN compiler. This support is deprecated and FINN uses the export to the QONNX format as a front end; internally, FINN still uses the FINN-ONNX format.**\n", "\n", "In this notebook we'll go through an example of how to import a Brevitas-trained QNN into FINN. The steps will be as follows:\n", "\n", "1. Load up the trained PyTorch model\n", - "2. Call Brevitas FINN-ONNX export and visualize with Netron\n", - "3. Import into FINN and call cleanup transformations\n", + "2. Call Brevitas QONNX export and visualize with Netron\n", + "3. Import into FINN and convert QONNX to FINN-ONNX\n", "\n", "We'll use the following utility functions to print the source code for function calls (`showSrc()`) and to visualize a network using netron (`showInNetron()`) in the Jupyter notebook:" ] @@ -120,15 +122,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Call Brevitas FINN-ONNX export and visualize with Netron\n", + "## 2. Call Brevitas QONNX export and visualize with Netron\n", + "\n", + "Brevitas comes with built-in QONNX export functionality. This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n", "\n", - "Brevitas comes with built-in FINN-ONNX export functionality. This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n", + "1. Weight and activation quantization is represented as a 'fake-quantization' with Quant and BipolarQuant nodes.\n", + "2. Truncation operations as required by average pooling are represented with a Trunc node.\n", "\n", - "1. The weight quantization logic is not exported as part of the graph; rather, the quantized weights themselves are exported.\n", - "2. Special quantization annotations are used to preserve the low-bit quantization information. ONNX (at the time of writing) supports 8-bit quantization as the minimum bitwidth, whereas FINN-ONNX quantization annotations can go down to binary/bipolar quantization.\n", - "3. Low-bit quantized activation functions are exported as MultiThreshold operators.\n", + "One can read more about how QONNX works and why it was developed here: https://xilinx.github.io/finn//2021/11/03/qonnx-and-finn.html\n", "\n", - "It's actually quite straightforward to export ONNX from our Brevitas model as follows:" + "Additionally, QONNX comes with a set of tools for working with the format. These are maintained together with the Fast Machine Learning collaboration as an open-source project here: https://github.com/fastmachinelearning/qonnx\n", + "\n", + "It's actually quite straightforward to export QONNX from our Brevitas model as follows:" ] }, { @@ -137,10 +142,10 @@ "metadata": {}, "outputs": [], "source": [ - "import brevitas.onnx as bo\n", - "export_onnx_path = \"/tmp/LFCW1A1.onnx\"\n", + "from brevitas.export import export_qonnx\n", + "export_onnx_path = \"/tmp/LFCW1A1_qonnx.onnx\"\n", "input_shape = (1, 1, 28, 28)\n", - "bo.export_finn_onnx(lfc, input_shape, export_onnx_path)" + "export_qonnx(lfc, torch.randn(input_shape), export_onnx_path);" ] }, { @@ -156,23 +161,23 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('/tmp/LFCW1A1.onnx')" + "showInNetron(export_onnx_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values, and the activations are Sign functions." + "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Import into FINN and call cleanup transformations\n", + "## 3. Import into FINN and convert QONNX to FINN-ONNX\n", "\n", - "We will now import this ONNX model into FINN using the ModelWrapper, and examine some of the graph attributes from Python." + "Similarly to the 1a notebook, we will first run a cleanup transformation on the exported QONNX model." ] }, { @@ -181,16 +186,10 @@ "metadata": {}, "outputs": [], "source": [ - "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper(export_onnx_path)\n", - "model.graph.node[8]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ModelWrapper exposes a range of other useful functions as well. For instance, by convention the second input of the MatMul node will be a pre-initialized weight tensor, which we can view using the following:" + "from qonnx.util.cleanup import cleanup\n", + "\n", + "export_onnx_path_cleaned = \"/tmp/LFCW1A1-qonnx-clean.onnx\"\n", + "cleanup(export_onnx_path, out_file=export_onnx_path_cleaned)" ] }, { @@ -199,14 +198,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_initializer(model.graph.node[8].input[1])" + "showInNetron(export_onnx_path_cleaned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also examine the quantization annotations and shapes of various tensors using the convenience functions provided by ModelWrapper." + "We will now import this QONNX model into FINN using the ModelWrapper. Here we can immediately execute the model to verify correctness."
] }, { @@ -215,7 +214,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_tensor_datatype(model.graph.node[8].input[1]).name" + "from qonnx.core.modelwrapper import ModelWrapper\n", + "import qonnx.core.onnx_exec as oxe\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", + "output_dict = oxe.execute_onnx(model, input_dict)\n", + "produced_qonnx = output_dict[list(output_dict.keys())[0]]\n", + "\n", + "produced_qonnx" ] }, { @@ -224,14 +230,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_tensor_shape(model.graph.node[8].input[1])" + "np.isclose(produced, produced_qonnx).all()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we want to operate further on this model in FINN, it is a good idea to execute certain \"cleanup\" transformations on this graph. Here, we will run shape inference and constant folding on this graph, and visualize the resulting graph in Netron again." + "Using the `QONNXtoFINN` transformation we can convert the model to the FINN internal FINN-ONNX representation. Notably all Quant and BipolarQuant nodes will have disappeared and are converted into MultiThreshold nodes." ] }, { @@ -240,12 +246,13 @@ "metadata": {}, "outputs": [], "source": [ - "from qonnx.transformation.fold_constants import FoldConstants\n", - "from qonnx.transformation.infer_shapes import InferShapes\n", - "model = model.transform(InferShapes())\n", - "model = model.transform(FoldConstants())\n", - "export_onnx_path_transformed = \"/tmp/LFCW1A1-clean.onnx\"\n", - "model.save(export_onnx_path_transformed)" + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "\n", + "model = model.transform(ConvertQONNXtoFINN())\n", + "\n", + "export_onnx_path_converted = \"/tmp/LFCW1A1-qonnx-converted.onnx\"\n", + "model.save(export_onnx_path_converted)" ] }, { @@ -254,14 +261,14 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('/tmp/LFCW1A1-clean.onnx')" + "showInNetron(export_onnx_path_converted)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the resulting graph has become smaller and simpler. Specifically, the input reshaping is now a single Reshape node instead of the Shape -> Gather -> Unsqueeze -> Concat -> Reshape sequence. We can now use the internal ONNX execution capabilities of FINN to ensure that we still get the same output from this model as we did with PyTorch." + "And once again we can execute the model with the FINN/QONNX execution engine." 
] }, { @@ -270,8 +277,8 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.core.onnx_exec as oxe\n", - "input_dict = {\"0\": nph.to_array(input_tensor)}\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "output_dict = oxe.execute_onnx(model, input_dict)\n", "produced_finn = output_dict[list(output_dict.keys())[0]]\n", "\n", @@ -284,7 +291,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.isclose(produced, produced_finn).all()" + "np.isclose(produced_qonnx, produced_finn).all()" ] }, { @@ -297,7 +304,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -311,7 +318,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index a2747e3921..9e9d52e476 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -46,7 +46,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of which includes several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", @@ -72,7 +72,7 @@ "source": [ "## 1. Brevitas Export, FINN Import and Tidy-Up\n", "\n", - "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology." + "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology. 
The network will be exported in QONNX format and then converted into the FINN-ONNX format to prepare it for the FINN compiler." ] }, { @@ -81,17 +81,23 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import onnx\n", "from finn.util.test import get_test_model_trained\n", - "import brevitas.onnx as bo\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", "from qonnx.core.modelwrapper import ModelWrapper\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "from qonnx.transformation.infer_shapes import InferShapes\n", "from qonnx.transformation.fold_constants import FoldConstants\n", "from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", "\n", "cnv = get_test_model_trained(\"CNV\", 1, 1)\n", - "bo.export_finn_onnx(cnv, (1, 3, 32, 32), build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n", - "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n", + "export_onnx_path = build_dir + \"/end2end_cnv_w1a1_export.onnx\"\n", + "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n", + "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)\n", + "model = ModelWrapper(export_onnx_path)\n", + "model = model.transform(ConvertQONNXtoFINN())\n", "model = model.transform(InferShapes())\n", "model = model.transform(FoldConstants())\n", "model = model.transform(GiveUniqueNodeNames())\n", @@ -148,10 +154,12 @@ "# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n", "totensor_pyt = ToTensor()\n", "chkpt_preproc_name = build_dir+\"/end2end_cnv_w1a1_preproc.onnx\"\n", - "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n", + "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n", + "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n", + "pre_model = ModelWrapper(chkpt_preproc_name)\n", + "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n", "\n", "# join preprocessing and core model\n", - "pre_model = ModelWrapper(chkpt_preproc_name)\n", "model = model.transform(MergeONNXModels(pre_model))\n", "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n", "global_inp_name = model.graph.input[0].name\n", @@ -199,7 +207,7 @@ "\n", "![](cnv-mp-fc.png)\n", "\n", - "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vivado HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", + "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", "\n", "\n", "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. 
You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", @@ -240,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summmmary:\n", + "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summary:\n", "\n", "* `Streamline` moves floating point scaling and addition operations closer to the input of the nearest thresholding activation and absorbs them into thresholds\n", "* `LowerConvsToMatMul` converts ONNX `Conv` nodes into sequences of `Im2Col, MatMul` nodes as discussed above. `Im2Col` is a custom FINN ONNX high-level node type that implements the sliding window operator.\n", @@ -359,21 +367,21 @@ "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", - " (16, 3, 128),\n", - " (32, 32, 128),\n", - " (16, 32, 128),\n", - " (16, 32, 128),\n", - " (4, 32, 81),\n", - " (1, 32, 2),\n", - " (1, 4, 2),\n", - " (1, 8, 128),\n", - " (5, 1, 3),\n", + " (16, 3, [128]),\n", + " (32, 32, [128]),\n", + " (16, 32, [128]),\n", + " (16, 32, [128]),\n", + " (4, 32, [81]),\n", + " (1, 32, [2]),\n", + " (1, 4, [2]),\n", + " (1, 8, [128]),\n", + " (5, 1, [3]),\n", "]\n", "for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):\n", " fcl_inst = getCustomOp(fcl)\n", " fcl_inst.set_nodeattr(\"PE\", pe)\n", " fcl_inst.set_nodeattr(\"SIMD\", simd)\n", - " fcl_inst.set_nodeattr(\"inFIFODepth\", ififodepth)\n", + " fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n", "\n", "# use same SIMD values for the sliding window operators\n", "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n", @@ -462,11 +470,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 5. Deployment and Remote Execution\n", + "## 5. Deployment and Execution\n", "\n", - "Now that we're done with the hardware generation, we can copy the necessary files onto our PYNQ board.\n", - "\n", - "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**" + "The bitfile and generated driver file(s) will be copied into a deployment folder which can then be used to run the network on the PYNQ board."
] }, { @@ -475,33 +481,33 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "from shutil import copy\n", + "from distutils.dir_util import copy_tree\n", "\n", - "# set up the following values according to your own environment\n", - "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = \"192.168.2.99\"\n", - "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", - "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", - "port = os.getenv(\"PYNQ_PORT\", 22)\n", - "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_cnv_end2end_example\")\n", - "# set up ssh options to only allow publickey authentication\n", - "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n", + "# create directory for deployment files\n", + "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n", + "model.set_metadata_prop(\"pynq_deployment_dir\", deployment_dir)\n", "\n", - "# test access to PYNQ board\n", - "! ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic" + "# get and copy necessary files\n", + "# .bit and .hwh file\n", + "bitfile = model.get_metadata_prop(\"bitfile\")\n", + "hwh_file = model.get_metadata_prop(\"hw_handoff\")\n", + "deploy_files = [bitfile, hwh_file]\n", + "\n", + "for dfile in deploy_files:\n", + " if dfile is not None:\n", + " copy(dfile, deployment_dir)\n", + "\n", + "# driver.py and python libraries\n", + "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n", + "copy_tree(pynq_driver_dir, deployment_dir)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n", - "\n", - "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n", - "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n", - "model.save(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")" + "Next to these files, we will also need an example numpy array to test the network on the PYNQ board. (*and before you ask, that's supposed to be a cat (CIFAR-10 class number 3)*) Recall that we partitioned our original network into a parent graph that contained the non-synthesizable nodes and a child graph that contained the bulk of the network, which we turned into a bitfile. The only operator left outside the FPGA partition was a `Transpose` to convert NCHW images into NHWC ones. Thus, we can skip the execution in the parent as long as we ensure our image has the expected data layout. The example numpy array can then be saved as .npy file." ] }, { @@ -510,8 +516,15 @@ "metadata": {}, "outputs": [], "source": [ - "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n", - "target_dir_pynq" + "import importlib_resources\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "ref = importlib_resources.files(\"finn.qnn-data\") / \"cifar10/cifar10-test-data-class3.npz\"\n", + "with importlib_resources.as_file(ref) as fn:\n", + " x = np.load(fn)[\"arr_0\"]\n", + "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n", + "plt.imshow(x)" ] }, { @@ -520,14 +533,19 @@ "metadata": {}, "outputs": [], "source": [ - "! 
ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'" + "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n", + "iname = model.graph.input[0].name\n", + "ishape = model.get_tensor_shape(iname)\n", + "np.save(deployment_dir + \"/input.npy\", x.reshape(ishape))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the CIFAR-10 dataset. Let's load up some test data that comes bundled with FINN -- *and before you ask, that's supposed to be a cat (CIFAR-10 class number 3)*." + "! ls {deployment_dir}" ] }, { @@ -536,54 +554,34 @@ "metadata": {}, "outputs": [], "source": [ - "import pkg_resources as pk\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n", - "x = np.load(fn)[\"arr_0\"]\n", - "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n", - "plt.imshow(x)" + "from shutil import make_archive\n", + "make_archive('deploy-on-pynq-cnv', 'zip', deployment_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Recall that we partitioned our original network into a parent graph that contained the non-synthesizable nodes and a child graph that contained the bulk of the network, which we turned into a bitfile. The only operator left outside the FPGA partition was a `Transpose` to convert NCHW images into NHWC ones. Thus, we can skip the execution in the parent as long as we ensure our image has the expected data layout, which we have done above." + "You can now download the created zipfile (File -> Open, mark the checkbox next to the deploy-on-pynq-cnv.zip and select Download from the toolbar), then copy it to your PYNQ board (for instance via scp or rsync). Then, run the following commands on the PYNQ board to extract the archive and run the execution:" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from finn.core.onnx_exec import execute_onnx\n", - "\n", - "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")\n", - "iname = model.graph.input[0].name\n", - "oname = model.graph.output[0].name\n", - "ishape = model.get_tensor_shape(iname)\n", - "input_dict = {iname: x.astype(np.float32).reshape(ishape)}\n", - "ret = execute_onnx(model, input_dict, True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "ret[oname]" + "```shell\n", + "unzip deploy-on-pynq-cnv.zip -d finn-cnv-demo\n", + "cd finn-cnv-demo\n", + "sudo python3 -m pip install bitstring\n", + "sudo python3 driver.py --exec_mode=execute --batchsize=1 --bitfile=resizer.bit --inputfile=input.npy\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the network correctly predicts this as a class 3 (\"cat\"). " + "The output will be saved on the PYNQ board as `output.npy` and can be copied to the host and opened with `np.load()`."
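For example, a minimal sketch of that last step on the host (assuming you have copied `output.npy` into the current working directory):

```python
import numpy as np

# Sketch: inspect the driver output copied back from the PYNQ board. For the
# cat image above, class 3 ("cat") is the expected prediction.
out = np.load("output.npy")
print("Driver output:", out.flatten())
```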
] }, { @@ -592,7 +590,7 @@ "source": [ "### Validating the Accuracy on a PYNQ Board \n", "\n", - "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n", + "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board.\n", "\n", "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n", "\n", @@ -601,16 +599,9 @@ "\n", "Command to execute on PYNQ:\n", "\n", - "```pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'" + "```shell\n", + "sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading\n", + "```" ] }, { @@ -621,16 +612,9 @@ "\n", "Command to execute on PYNQ:\n", "\n", - "`python3.6 validate.py --dataset cifar10 --batchsize 1000`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset cifar10 --batchsize 1000'" + "```shell\n", + "sudo python3 validate.py --dataset cifar10 --batchsize 1000\n", + "```" ] }, { @@ -643,7 +627,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -657,9 +641,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a6f05df309..a5c97328a5 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. 
The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of which includes several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", @@ -81,19 +81,23 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import onnx\n", "from finn.util.test import get_test_model_trained\n", - "import brevitas.onnx as bo\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", "\n", "tfc = get_test_model_trained(\"TFC\", 1, 1)\n", - "bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log" + "export_onnx_path = build_dir+\"/tfc_w1_a1.onnx\"\n", + "export_qonnx(tfc, torch.randn(1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log\n", + "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The model was now exported, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", + "The model was now exported in QONNX format, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties." ] }, @@ -110,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx); this repository contains several functionalities that are used in FINN. The model was exported in QONNX format; to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format." ] }, { @@ -120,7 +124,9 @@ "outputs": [], "source": [ "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")" + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", + "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")\n", + "model = model.transform(ConvertQONNXtoFINN())" ] }, { @@ -161,7 +167,7 @@ "\n", "![](finn-hw-arch.png)\n", "\n", - "In practice, the compute arrays are instantiated by function calls to optimized Vivado HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. 
As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process." + "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process." ] }, { @@ -248,7 +254,7 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operations into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L104), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor`) and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." ] }, { @@ -267,10 +273,12 @@ "outputs": [], "source": [ "# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n", "totensor_pyt = ToTensor()\n", "chkpt_preproc_name = build_dir+\"/tfc_w1_a1_preproc.onnx\"\n", - "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n", + "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n", + "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n", + "pre_model = ModelWrapper(chkpt_preproc_name)\n", + "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n", "\n", "# join preprocessing and core model\n", - "pre_model = ModelWrapper(chkpt_preproc_name)\n", "model = model.transform(MergeONNXModels(pre_model))\n", "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n", "global_inp_name = model.graph.input[0].name\n", @@ -343,7 +351,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As can be seen, several transformations are involved in the streamlining transformation. There are move and collapse transformations. In the last step the operations are transformed into multithresholds. The involved transformations can be viewed in detail [here](https://github.com/Xilinx/finn/tree/master/src/finn/transformation/streamline). 
After each transformation, three of the tidy-up transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames` and `InferDataTypes`) are applied to the model.\n", + "As can be seen, several transformations are involved in the streamlining transformation. There are move and collapse transformations. In the last step the operations are transformed into multithresholds. The involved transformations can be viewed in detail [here](https://github.com/Xilinx/finn/tree/main/src/finn/transformation/streamline). After each transformation, three of the tidy-up transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames` and `InferDataTypes`) are applied to the model.\n", "\n", "After streamlining the network looks as follows:" ] @@ -525,7 +533,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." ] }, { @@ -547,7 +555,7 @@ "metadata": {}, "source": [ "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints. 
There are also a lot of additional attributes that can be set for this node type.\n", - "**In this notebook we are setting the folding factors and FIFO depths manually, but in a future version we will support determining the folding factors given an FPGA resource budget according to the analytical model from the [FINN-R paper](https://arxiv.org/pdf/1809.04570).**" + "**In this notebook we are setting the folding factors and FIFO depths manually, but it is possible to use FINN transformations for this ([SetFolding](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py) and [InsertAndSetFIFODepths](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_fifo_depths.py)).**" ] }, { @@ -559,17 +567,17 @@ "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", - " (16, 49, 16, 64, \"block\"),\n", - " (8, 8, 64, 64, \"auto\"),\n", - " (8, 8, 64, 64, \"auto\"),\n", - " (10, 8, 64, 10, \"distributed\"),\n", + " (16, 49, [16], [64], \"block\"),\n", + " (8, 8, [64], [64], \"auto\"),\n", + " (8, 8, [64], [64], \"auto\"),\n", + " (10, 8, [64], [10], \"distributed\"),\n", "]\n", "for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):\n", " fcl_inst = getCustomOp(fcl)\n", " fcl_inst.set_nodeattr(\"PE\", pe)\n", " fcl_inst.set_nodeattr(\"SIMD\", simd)\n", - " fcl_inst.set_nodeattr(\"inFIFODepth\", ififo)\n", - " fcl_inst.set_nodeattr(\"outFIFODepth\", ofifo)\n", + " fcl_inst.set_nodeattr(\"inFIFODepths\", ififo)\n", + " fcl_inst.set_nodeattr(\"outFIFODepths\", ofifo)\n", " fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n", " \n", "# set parallelism for input quantizer to be same as first layer's SIMD\n", @@ -590,7 +598,7 @@ "metadata": {}, "source": [ "Besides PE and SIMD, three other node attributes are set. `ram_style` specifies how the weights are to be stored (BRAM, LUTRAM, and so on). It can be selected explicitly, or with the option `auto` you can let Vivado decide.\n", - "`inFIFODepth` and `outFIFODepth` specifies the FIFO depths that is needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n", + "`inFIFODepths` and `outFIFODepths` specify the FIFO depths that are needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n", "\n", "In previous versions of FINN we had to call transformations to insert data width converters, FIFOs and `TLastMarker` manually at this step. This is no longer needed, as all this is taken care of by the `ZynqBuild` or `VitisBuild` transformations." ] }, { @@ -609,7 +617,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This completes the network preparation and the network can be passed on to the next block *Vivado HLS and IPI*, which is described below." + "This completes the network preparation and the network can be passed on to the next block *Vitis HLS and IPI*, which is described below." ] }, { @@ -798,23 +806,21 @@ "source": [ "## 4. 
PYNQ deployment \n", "\n", - "* [Deployment and Remote Execution](#deploy)\n", + "* [Deployment](#deploy)\n", "* [Validation on PYNQ Board](#validation)\n", "* [Throughput Test on PYNQ Board](#throughput)\n", "\n", "\n", - "We are almost done preparing our hardware design. We'll now put it in a form suitable for use as a PYNQ overlay, synthesize and deploy it." + "The bitfile and generated driver will be copied together with some necessary files for execution into a deployment folder which then can be used to run the network on the PYNQ board." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Deployment and Remote Execution \n", + "### Deployment \n", "\n", - "We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy that to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ below.\n", - "\n", - "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**" + "We'll now create a deployment folder with the bitfile and driver file(s), zip it, and afterwards copy it to the PYNQ board for execution and validation." ] }, { @@ -823,74 +829,33 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "from shutil import copy\n", + "from distutils.dir_util import copy_tree\n", "\n", - "# set up the following values according to your own environment\n", - "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = \"192.168.2.99\"\n", - "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", - "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", - "port = os.getenv(\"PYNQ_PORT\", 22)\n", - "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_tfc_end2end_example\")\n", - "# set up ssh options to only allow publickey authentication\n", - "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n", + "# create directory for deployment files\n", + "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n", + "model.set_metadata_prop(\"pynq_deployment_dir\", deployment_dir)\n", "\n", - "# test access to PYNQ board\n", - "! 
ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n", + "# get and copy necessary files\n", + "# .bit and .hwh file\n", + "bitfile = model.get_metadata_prop(\"bitfile\")\n", + "hwh_file = model.get_metadata_prop(\"hw_handoff\")\n", + "deploy_files = [bitfile, hwh_file]\n", "\n", - "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n", - "model.save(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's verify that the remote access credentials is saved in the model metadata, and that the deployment folder has been successfully copied to the board:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.model.metadata_props" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n", - "target_dir_pynq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'" + "for dfile in deploy_files:\n", + " if dfile is not None:\n", + " copy(dfile, deployment_dir)\n", + "\n", + "# driver.py and python libraries\n", + "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n", + "copy_tree(pynq_driver_dir, deployment_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the MNIST dataset. Let's load up some test data that comes bundled with FINN." + "Next to these files, we will also need an example numpy array to test the network on the PYNQ board. You may recall that one \"reshape\" node was left out of the StreamingDataflowPartition. We'll do that manually with a numpy function call when passing in the input, but everything else in the network ended up inside the StreamingDataflowPartition so that's all we need to do. The example numpy array can then be saved as .npy file. " ] }, { @@ -914,18 +879,23 @@ "metadata": {}, "outputs": [], "source": [ - "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")\n", + "import numpy as np\n", + "\n", + "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n", "iname = model.graph.input[0].name\n", "oname = parent_model.graph.output[0].name\n", "ishape = model.get_tensor_shape(iname)\n", - "print(\"Expected network input shape is \" + str(ishape))" + "print(\"Expected network input shape is \" + str(ishape))\n", + "np.save(deployment_dir + \"/input.npy\", x.reshape(ishape))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Finally, we can call `execute_onnx` on the graph, which will internally call remote execution with the bitfile, grab the results and return a numpy array. You may recall that one \"reshape\" node was left out of the StreamingDataflowPartition. We'll do that manually with a numpy function call when passing in the input, but everything else in the network ended up inside the StreamingDataflowPartition so that's all we need to do." + "! 
ls {deployment_dir}" ] }, { @@ -934,27 +904,34 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from finn.core.onnx_exec import execute_onnx\n", - "\n", - "input_dict = {iname: x.reshape(ishape)}\n", - "ret = execute_onnx(model, input_dict)" + "from shutil import make_archive\n", + "make_archive('deploy-on-pynq-tfc', 'zip', deployment_dir)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "ret[oname]" + "You can now download the created zipfile (**File -> Open**, mark the checkbox next to the `deploy-on-pynq-tfc.zip` and select Download from the toolbar), then copy it to your PYNQ board (for instance via `scp` or `rsync`). Then, run the following commands **on the PYNQ board** to extract the archive and run the execution:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "```shell\n", + "unzip deploy-on-pynq-tfc.zip -d finn-tfc-demo\n", + "cd finn-tfc-demo\n", + "sudo python3 -m pip install bitstring\n", + "sudo python3 driver.py --exec_mode=execute --batchsize=1 --bitfile=resizer.bit --inputfile=input.npy\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the network correctly predicts this as a digit 2." + "The output will be saved on the PYNQ board as `output.npy` and can be copied to the host and opened with `np.load()`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Validating the Accuracy on a PYNQ Board \n", - "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n", - "\n", "**Ensure that your PYNQ board has a working internet connection for the next steps, since there is some downloading involved.**\n", "\n", "To validate the accuracy, we first need to install the [`dataset-loading`](https://github.com/fbcotter/dataset_loading) Python package to the PYNQ board. This will give us a convenient way of downloading and accessing the MNIST dataset.\n", "\n", "\n", - "Command to execute on PYNQ:\n", + "Command to execute on PYNQ board:\n", "\n", - "```sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'" + "```shell\n", + "sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading\n", + "```" ] }, { @@ -990,18 +958,11 @@ "source": [ "We can now use the `validate.py` script that was generated together with the driver to measure top-1 accuracy on the MNIST dataset.\n", "\n", - "Command to execute on PYNQ:\n", + "Command to execute on PYNQ board:\n", "\n", - "`python3.6 validate.py --dataset mnist --batchsize 1000`" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! 
ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'" + "```shell\n", + "sudo python3 validate.py --dataset mnist --batchsize 1000\n", + "```" ] }, { @@ -1016,60 +977,30 @@ "metadata": {}, "source": [ "### Throughput Test on PYNQ Board \n", - "In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done using the core function `throughput_test`. In the next section we import the function and execute it.\n", - "First we extract the `remote_exec_model` again and pass it to the function. The function returns the metrics of the network as dictionary. " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from finn.core.throughput_test import throughput_test_remote\n", - "\n", - "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")\n", - "res = throughput_test_remote(model, 10000)\n", - "print(\"Network metrics:\")\n", - "for key in res:\n", - " print(str(key) + \": \" + str(res[key]))" + "In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done by setting the `exec_mode` to `throughput_test`. \n", + "Command to execute on PYNQ board:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Together with the values for folding we can evaluate the performance of our accelerator. Each layer has a total folding factor of 64 and because the network is fully pipelined, it follows: `II = 64`. II is the initiation interval and indicates how many cycles are needed for one input to be processed. " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "II = 64\n", - "# frequency in MHz\n", - "f_MHz = 100\n", - "# expected throughput in MFPS\n", - "expected_throughput = f_MHz / II\n", - "# measured throughput (FPS) from throughput test, converted to MFPS\n", - "measured_throughput = res[\"throughput[images/s]\"] * 0.000001\n", - "# peformance\n", - "print(\"We reach approximately \" + str(round((measured_throughput / expected_throughput)*100)) + \"% of the ideal performance.\")" + "```shell\n", + "sudo python3 driver.py --exec_mode=throughput_test --batchsize=1000 --bitfile=resizer.bit\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases." + "The network metrics from the throughput test are saved in a file called `nw_metrics.txt` on the PYNQ board, which can be inspected after running the command above." 
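For reference, a minimal sketch of reading the saved metrics back (this assumes the generated driver serializes the metrics dictionary with `str()`, so it can be parsed as a Python literal; the file name matches the text above):

```python
# Hedged sketch: inspect nw_metrics.txt on the PYNQ board (or after copying
# it back to the host). Assumes the file contains a Python dict literal.
import ast

with open("nw_metrics.txt", "r") as f:
    metrics = ast.literal_eval(f.read())

for key in metrics:
    print(str(key) + ": " + str(metrics[key]))
```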
] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1083,7 +1014,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 813127197e..2f6cde6e5b 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -61,7 +61,7 @@ "fc = get_test_model_trained(\"TFC\", 1, 1)\n", "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", - "input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()\n", + "input_brevitas = torch.from_numpy(nph.to_array(input_tensor).copy()).float()\n", "output_golden = fc.forward(input_brevitas).detach().numpy()\n", "output_golden" ] @@ -72,7 +72,7 @@ "source": [ "## Simulation using Python \n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", "\n", "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. 
The following is an example of the execution function of a XNOR popcount node.\n" ] @@ -121,12 +121,11 @@ "output_dict = oxe.execute_onnx(model_for_sim, input_dict, return_full_exec_context=False)\n", "output_pysim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "\n", - "\n", - "if np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -268,10 +267,11 @@ "output_dict = oxe.execute_onnx(parent_model, input_dict)\n", "output_cppsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -356,10 +356,11 @@ "output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n", "output_rtlsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -383,7 +384,15 @@ "\n", "child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n", "child_model = child_model.transform(InsertDWC())\n", - "child_model = child_model.transform(InsertFIFO())\n", + "\n", + "# set all impl_styles of the DWCs to hls to enable emulation\n", + "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n", + "for dwc in dwc_nodes:\n", + " dwc_inst = getCustomOp(dwc)\n", + " dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n", + " \n", + "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", + "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", "child_model = child_model.transform(HLSSynthIP())\n", @@ -422,16 +431,17 @@ "metadata": {}, "outputs": [], "source": [ - "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 5625a6f1c2..2885100512 100644 --- 
a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -53,7 +53,7 @@ " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", " * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", "* [Network Surgery Before Export](#network_surgery)\n", - "* [Export to FINN-ONNX](#export_finn_onnx)" + "* [Export to QONNX and Conversion to FINN-ONNX](#export_qonnx)" ] }, { @@ -62,8 +62,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "import onnx\n", - "import torch" + "import torch\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"" ] }, { @@ -483,13 +486,14 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "import torch\n", "\n", "# Make sure the model is on CPU before loading a pretrained state_dict\n", "model = model.cpu()\n", "\n", "# Load pretrained weights\n", - "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n", "\n", "model.load_state_dict(trained_state_dict, strict=False)" ] }, @@ -663,12 +667,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Export to FINN-ONNX \n", + "# Export to QONNX and Conversion to FINN-ONNX \n", "\n", "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX; you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation. Note how we create a `QuantTensor` instance with dummy data to tell Brevitas how our inputs look like, which will be used to set the input quantization annotation on the exported model." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas; to feed it into the FINN compiler, we will need to convert it to the FINN-ONNX format, which is the intermediate representation the compiler works on. The conversion to the FINN-ONNX format is a FINN compiler transformation, and to be able to apply it to our model, we will need to wrap it in [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format." 
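To make the `ModelWrapper` helper functions mentioned above concrete, here is an illustrative sketch (the method names are from the qonnx package; the file name assumes the export cell below has already been run, and the printed values depend on the exported graph):

```python
# Hedged sketch: a few ModelWrapper conveniences for inspecting the export.
from qonnx.core.modelwrapper import ModelWrapper

model = ModelWrapper("cybsec-mlp-ready.onnx")
inp_name = model.graph.input[0].name
print(model.get_tensor_shape(inp_name))     # e.g. [1, 600] for this MLP
print(model.get_tensor_datatype(inp_name))  # FINN datatype annotation on the input
```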
] }, { @@ -677,10 +681,12 @@ "metadata": {}, "outputs": [], "source": [ - "import brevitas.onnx as bo\n", - "from brevitas.quant_tensor import QuantTensor\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "\n", - "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", + "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "input_shape = (1, 600)\n", "\n", "# create a QuantTensor instance to mark input as bipolar during export\n", @@ -688,18 +694,23 @@ "input_a = 2 * input_a - 1\n", "scale = 1.0\n", "input_t = torch.from_numpy(input_a * scale)\n", - "input_qt = QuantTensor(\n", - " input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True\n", - ")\n", "\n", "#Move to CPU before export\n", "model_for_export.cpu()\n", "\n", "# Export to ONNX\n", - "bo.export_finn_onnx(\n", - " model_for_export, export_path=ready_model_filename, input_t=input_qt\n", + "export_qonnx(\n", + " model_for_export, export_path=ready_model_filename, input_t=input_t\n", ")\n", "\n", + "# clean-up\n", + "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n", + "\n", + "# ModelWrapper\n", + "model = ModelWrapper(ready_model_filename)\n", + "model = model.transform(ConvertQONNXtoFINN())\n", + "model.save(ready_model_filename)\n", + "\n", "print(\"Model saved to %s\" % ready_model_filename)" ] }, @@ -741,7 +752,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -755,7 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 370312c77e..a5bc165573 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -62,9 +62,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", "\n", - "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", + "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "model_for_sim = ModelWrapper(ready_model_filename)" ] }, @@ -151,7 +153,7 @@ "model_for_sim = model_for_sim.transform(InferDataTypes())\n", "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n", "\n", - "verif_model_filename = \"cybsec-mlp-verification.onnx\"\n", + "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n", "model_for_sim.save(verif_model_filename)" ] }, @@ -258,7 +260,8 @@ "\n", "# replace this with your trained network checkpoint if you're not\n", "# using the pretrained weights\n", - "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n", + "\n", "# Uncomment the following line if you previously chose to train the network yourself\n", "#trained_state_dict = torch.load(\"state_dict_self-trained.pth\")\n", "\n", @@ -365,10 +368,11 @@ "metadata": {}, "outputs": [], "source": [ - 
"if ok == n_verification_inputs:\n", + "try:\n", + " assert ok == n_verification_inputs\n", " print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n", - "else:\n", - " print(\"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\")" + "except AssertionError:\n", + " assert False, \"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\"" ] }, { @@ -381,7 +385,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -395,7 +399,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 33adb68dc8..80f3cd3819 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -115,7 +115,8 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "estimates_output_dir = \"output_estimates_only\"\n", "\n", @@ -148,6 +149,15 @@ "build.build_dataflow_cfg(model_file, cfg_estimates)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert os.path.exists(estimates_output_dir + \"/report/estimate_network_performance.json\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -272,7 +282,7 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "rtlsim_output_dir = \"output_ipstitch_ooc_rtlsim\"\n", "\n", @@ -305,6 +315,17 @@ "build.build_dataflow_cfg(model_file, cfg_stitched_ip)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", + "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", + "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -412,7 +433,7 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "final_output_dir = \"output_final\"\n", "\n", @@ -624,7 +645,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py index 738811fa72..38505fb6ef 100644 --- a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py +++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py @@ -48,7 +48,6 @@ def __init__( onehot=False, train=True, ): - self.dataframe = ( pd.concat([pd.read_csv(file_path_train), pd.read_csv(file_path_test)]) .reset_index() @@ -77,9 +76,7 @@ def __getitem__(self, index): data_val = self.data[index][:-1] return data_val, target - def dec2bin( - self, column: pd.Series, number_of_bits: int, left_msb: 
bool = True - ) -> pd.Series: + def dec2bin(self, column: pd.Series, number_of_bits: int, left_msb: bool = True) -> pd.Series: """Convert a decimal pd.Series to binary pd.Series with numbers in their # base-2 equivalents. The output is a numpy nd array. @@ -133,6 +130,7 @@ def integer_encoding(self, df): def quantize_df(self, df): """Quantized the input dataframe. The scaling is done by multiplying every column by the inverse of the minimum of that column""" + # gets the smallest positive number of a vector def get_min_positive_number(vector): return vector[vector > 0].min() @@ -178,24 +176,18 @@ def char_split(s): column_data = np.clip( column_data, 0, 4294967295 ) # clip due to overflow of uint32 of matlab code - column_data = self.round_like_matlab_series( - column_data - ) # round like matlab + column_data = self.round_like_matlab_series(column_data) # round like matlab column_data = column_data.astype(np.uint32) # cast like matlab if column == "rate": column_data.update(pd.Series(dict_correct_rate_values)) python_quantized_df[column] = ( - self.dec2bin(column_data, maxbits, left_msb=False) - .reshape((-1, 1)) - .flatten() + self.dec2bin(column_data, maxbits, left_msb=False).reshape((-1, 1)).flatten() ) for column in python_quantized_df.columns: - python_quantized_df[column] = ( - python_quantized_df[column].apply(char_split).values - ) + python_quantized_df[column] = python_quantized_df[column].apply(char_split).values python_quantized_df_separated = pd.DataFrame( np.column_stack(python_quantized_df.values.T.tolist()) diff --git a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py index 0ffb525544..c4570616d2 100644 --- a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py +++ b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py @@ -57,9 +57,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root): help='name of bitfile (i.e. "resizer.bit")', default="../bitfile/finn-accel.bit", ) - parser.add_argument( - "--dataset_root", help="dataset root dir for download/reuse", default="." 
- ) + parser.add_argument("--dataset_root", help="dataset root dir for download/reuse", default=".") # parse arguments args = parser.parse_args() bsize = args.batchsize diff --git a/requirements.txt b/requirements.txt index 9038a5e817..e03eff2c98 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ bitstring==3.1.7 -clize==4.1.1 +clize==5.0.1 dataclasses-json==0.5.7 -docrep==0.2.7 -future==0.18.2 gspread==3.6.0 -numpy==1.22.0 -onnx==1.11.0 +importlib-resources==6.1.0 +ipython==8.12.2 +numpy==1.24.1 +onnx==1.13.0 onnxoptimizer -onnxruntime==1.11.1 -pre-commit==2.9.2 -protobuf==3.20.2 -pyscaffold==3.2.1 -scipy==1.5.2 +onnxruntime==1.15.0 +pre-commit==3.3.2 +protobuf==3.20.3 +psutil==5.9.4 +pyscaffold==4.4 +scipy==1.10.1 setupext-janitor>=1.1.2 -sigtools==2.0.3 -toposort==1.5 +setuptools==68.2.2 +sigtools==4.0.1 +toposort==1.7.0 vcdvcd==1.0.5 wget==3.2 diff --git a/run-docker.sh b/run-docker.sh index 381be35293..58d7d97084 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -86,7 +86,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${ALVEO_BOARD="U250"} : ${ALVEO_TARGET_DIR="/tmp"} : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"} -: ${XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"} +: ${XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"} : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"} : ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"} : ${FINN_DOCKER_PREBUILT="0"} @@ -100,9 +100,12 @@ SCRIPTPATH=$(dirname "$SCRIPT") DOCKER_INTERACTIVE="" +# Catch FINN_DOCKER_EXTRA options being passed in without a trailing space +FINN_DOCKER_EXTRA+=" " + if [ "$1" = "test" ]; then gecho "Running test suite (all tests)" - DOCKER_CMD="python setup.py test" + DOCKER_CMD="pytest" elif [ "$1" = "quicktest" ]; then gecho "Running test suite (non-Vivado, non-slow tests)" DOCKER_CMD="quicktest.sh" @@ -201,6 +204,9 @@ DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD " DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR " DOCKER_EXEC+="-e OHMYXILINX=$OHMYXILINX " DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " +# Workaround for FlexLM issue, see: +# https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 +DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro " diff --git a/setup.cfg b/setup.cfg index a1d0fef6cb..4834011dea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,12 +34,12 @@ name = finn description = A Framework for Fast, Scalable Quantized Neural Network Inference author = Yaman Umuroglu -author-email = yamanu@xilinx.com +author_email = yamanu@xilinx.com license = new-bsd -long-description = file: README.md -long-description-content-type = text/markdown +long_description = file: README.md +long_description_content_type = text/markdown url = https://xilinx.github.io/finn/ -project-urls = +project_urls = Documentation = https://finn.readthedocs.io/ # Change if running only on Windows, Mac or Linux (comma-separated) platforms = any @@ -56,8 +56,6 @@ packages = find_namespace: include_package_data = True package_dir = =src -# DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! -setup_requires = pyscaffold>=3.2a0,<3.3a0 # The usage of test_requires is discouraged, see `Dependency Management` docs # tests_require = pytest; pytest-cov # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 @@ -72,18 +70,22 @@ exclude = # Add here additional requirements for extra features, to install with: # `pip install FINN[PDF]` like: # PDF = ReportLab; RXP -# finn-base is needed to build the full set of docs +# qonnx is needed to build the full set of docs docs = - finn-base==0.0.3 docutils==0.17.1 dataclasses-json==0.5.7 gspread==3.6.0 + IPython pytest netron vcdvcd + sphinx==5.0.2 + sphinx_rtd_theme==0.5.0 torchvision torch qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx + pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator + brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples # Add here test requirements (semicolon/line-separated) testing = @@ -125,6 +127,12 @@ markers = transform: mark tests that test transformations (before hls layers) fpgadataflow: mark tests related to hls layers end2end: mark tests that run the end2end flow + notebooks: mark tests that execute all Jupyter notebooks + sanity_bnn: mark tests that execute the sanity BNN test + bnn_u250: mark tests that execute U250 BNN tests + bnn_kv260: mark tests that execute KV260 BNN tests + bnn_pynq: mark tests that execute Pynq-Z1 BNN tests + bnn_zcu104: mark tests that execute ZCU104 BNN tests norecursedirs = dist build diff --git a/setup.py b/setup.py index 8fd781462c..9a06632af1 100644 --- a/setup.py +++ b/setup.py @@ -35,17 +35,7 @@ PyScaffold helps you to put up the scaffold of your new Python project. Learn more under: https://pyscaffold.org/ """ -from pkg_resources import VersionConflict, require from setuptools import setup -import sys - -try: - require("setuptools>=38.3") -except VersionConflict: - print("Error: version of setuptools is too old (<38.3)!") - sys.exit(1) - - if __name__ == "__main__": - setup(use_pyscaffold=True) + setup() diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index 5726702666..824690f5f6 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -66,9 +66,7 @@ def dataflow_performance(model): max_pred_latency = 0 else: # find max of any of predecessors - pred_latencies = map( - lambda x: latency_at_node_output[x.name], predecessors - ) + pred_latencies = map(lambda x: latency_at_node_output[x.name], predecessors) max_pred_latency = max(pred_latencies) latency_at_node_output[node.name] = node_cycles + max_pred_latency critical_path_cycles = max(latency_at_node_output.values()) diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 8b9c5d2a04..3304b88d60 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -85,8 +85,8 @@ def get_instance_stats(inst_name): row = root.findall(".//*[@contents='%s']/.." 
% inst_name) if row != []: node_dict = {} - row = row[0].getchildren() - for (restype, ind) in restype_to_ind.items(): + row = list(row[0]) + for restype, ind in restype_to_ind.items(): node_dict[restype] = int(row[ind].attrib["contents"]) return node_dict else: diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index 406496bc0e..be4cf417bc 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -62,10 +62,7 @@ def res_estimation_complete(model): if is_fpgadataflow_node(node) is True: op_type = node.op_type inst = registry.getCustomOp(node) - if ( - op_type == "MatrixVectorActivation" - or op_type == "VectorVectorActivation" - ): + if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation": orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 238083f653..284cd2baa3 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -91,12 +91,8 @@ def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True): return steps_as_fxns -def resolve_step_filename( - step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0 -): - step_names = list( - map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False)) - ) +def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0): + step_names = list(map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False))) assert step_name in step_names, "start_step %s not found" + step_name step_no = step_names.index(step_name) + step_delta assert step_no >= 0, "Invalid step+delta combination" @@ -150,17 +146,13 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): for transform_step in build_dataflow_steps: try: step_name = transform_step.__name__ - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps))) # redirect output to logfile - sys.stdout = stdout_logger - sys.stderr = stderr_logger - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + if not cfg.verbose: + sys.stdout = stdout_logger + sys.stderr = stderr_logger + # also log current step name to logfile + print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps))) # run the step step_start = time.time() model = transform_step(model, cfg) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 09e9ec3a56..e4fed05731 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -37,6 +37,13 @@ from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +class AutoFIFOSizingMethod(str, Enum): + "Select the type of automatic FIFO sizing strategy." 
+ + CHARACTERIZE = "characterize" + LARGEFIFO_RTLSIM = "largefifo_rtlsim" + + class ShellFlowType(str, Enum): """For builds that produce a bitfile, select the shell flow that will integrate the FINN-generated accelerator.""" @@ -112,6 +119,7 @@ "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", + "step_minimize_bit_width", "step_generate_estimate_reports", "step_hls_codegen", "step_hls_ipgen", @@ -133,6 +141,7 @@ "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", + "step_minimize_bit_width", "step_generate_estimate_reports", ] @@ -226,6 +235,12 @@ class DataflowBuildConfig: #: flexibility, and makes it possible to have runtime-writable thresholds. standalone_thresholds: Optional[bool] = False + #: (Optional) Whether optimizations that minimize the bit width of the + #: weights and accumulator will be applied. Because this optimization relies + #: on the values of the weights, it will only be applied if runtime- + #: writeable weights is not enabled. + minimize_bit_width: Optional[bool] = True + #: Target board, only needed for generating full bitfiles where the FINN #: design is integrated into a shell. #: e.g. "Pynq-Z1" or "U250" @@ -246,6 +261,18 @@ #: for each FIFO. auto_fifo_depths: Optional[bool] = True + #: Whether FIFO nodes with depth larger than 32768 will be split. + #: Allows configuring very large FIFOs in the folding_config_file. + split_large_fifos: Optional[bool] = False + + #: When `auto_fifo_depths = True`, select which method will be used for + #: setting the FIFO sizes. + auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test + #: if set to True, always using Python instead + force_python_rtlsim: Optional[bool] = False + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO @@ -258,6 +285,10 @@ #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED + #: Force inference of RTL ConvolutionInputGenerator over HLS implementation + #: If set to False, falls back to the default behavior of InferConvInpGen() + force_rtl_conv_inp_gen: Optional[bool] = False + #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g. "xilinx_u250_xdma_201830_2" @@ -285,6 +316,10 @@ #: Whether pdb postmortem debugging will be launched when the build fails enable_build_pdb_debug: Optional[bool] = True + #: When True, all warnings and compiler output will be printed in stdout. + #: Otherwise, these will be suppressed and only appear in the build log. + verbose: Optional[bool] = False + #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. #: When specified: @@ -312,6 +347,10 @@ #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 + #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during 
+ rtlsim_use_vivado_comps: Optional[bool] = True + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified @@ -325,9 +364,7 @@ def _resolve_driver_platform(self): elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO: return "alveo" else: - raise Exception( - "Couldn't resolve driver platform for " + str(self.shell_flow_type) - ) + raise Exception("Couldn't resolve driver platform for " + str(self.shell_flow_type)) def _resolve_fpga_part(self): if self.fpga_part is None: @@ -369,8 +406,7 @@ def _resolve_vitis_platform(self): return alveo_default_platform[self.board] else: raise Exception( - "Could not resolve Vitis platform:" - " need either board or vitis_platform specified" + "Could not resolve Vitis platform:" " need either board or vitis_platform specified" ) def _resolve_verification_steps(self): @@ -388,8 +424,7 @@ def _resolve_verification_io_pair(self): ) verify_input_npy = np.load(self.verify_input_npy) assert os.path.isfile(self.verify_expected_output_npy), ( - "verify_expected_output_npy not found: " - + self.verify_expected_output_npy + "verify_expected_output_npy not found: " + self.verify_expected_output_npy ) verify_expected_output_npy = np.load(self.verify_expected_output_npy) return ( diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 59f77650da..54ba7e4ea1 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -29,6 +29,8 @@ import json import numpy as np import os +import shutil +import warnings from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -78,13 +80,24 @@ CreateDataflowPartition, ) from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.derive_characteristic import ( + DeriveCharacteristic, + DeriveFIFOSizes, +) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) @@ -92,6 +105,7 @@ from finn.transformation.fpgadataflow.set_fifo_depths import ( InsertAndSetFIFODepths, RemoveShallowFIFOs, + SplitLargeFIFOs, ) from finn.transformation.fpgadataflow.set_folding import SetFolding from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext @@ -107,6 +121,7 @@ get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, ) +from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent @@ -121,81 +136,120 @@ def verify_step( verify_out_dir = cfg.output_dir + "/verification_output" intermediate_models_dir = cfg.output_dir + "/intermediate_models" os.makedirs(verify_out_dir, exist_ok=True) - (in_npy, exp_out_npy) = 
cfg._resolve_verification_io_pair() - if need_parent: - assert ( - cfg.save_intermediate_models - ), "Enable save_intermediate_models for verification" - parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" - child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name - model.save(child_model_fn) - out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name - out_dict = execute_parent( - parent_model_fn, child_model_fn, in_npy, return_full_ctx=True - ) - out_npy = out_dict[out_tensor_name] - else: - inp_tensor_name = model.graph.input[0].name - out_tensor_name = model.graph.output[0].name - inp_dict = {inp_tensor_name: in_npy} - if rtlsim_pre_hook is not None: - out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair() + bsize_in = in_npy_all.shape[0] + bsize_out = exp_out_npy_all.shape[0] + assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair" + all_res = True + for b in range(bsize_in): + in_npy = np.expand_dims(in_npy_all[b], axis=0) + exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0) + if need_parent: + assert cfg.save_intermediate_models, "Enable save_intermediate_models for verification" + parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" + child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name + model.save(child_model_fn) + parent_model = ModelWrapper(parent_model_fn) + out_tensor_name = parent_model.graph.output[0].name + exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + out_dict = execute_parent(parent_model_fn, child_model_fn, in_npy, return_full_ctx=True) + out_npy = out_dict[out_tensor_name] else: - out_dict = execute_onnx(model, inp_dict, True) - out_npy = out_dict[out_tensor_name] - res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() - res_to_str = {True: "SUCCESS", False: "FAIL"} - res_str = res_to_str[res] - if cfg.verify_save_full_context: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % ( - step_name, - res_str, - ) - np.savez(verification_output_fn, **out_dict) - else: - verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % ( - step_name, - res_str, - ) - np.save(verification_output_fn, out_npy) - print("Verification for %s : %s" % (step_name, res_str)) + inp_tensor_name = model.graph.input[0].name + out_tensor_name = model.graph.output[0].name + exp_ishape = model.get_tensor_shape(inp_tensor_name) + if in_npy.shape != exp_ishape: + print( + "Verification input has shape %s while model expects %s" + % (str(in_npy.shape), str(exp_ishape)) + ) + print("Attempting to force model shape on verification input") + in_npy = in_npy.reshape(exp_ishape) + inp_dict = {inp_tensor_name: in_npy} + if rtlsim_pre_hook is not None: + out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + else: + out_dict = execute_onnx(model, inp_dict, True) + out_npy = out_dict[out_tensor_name] + exp_oshape = exp_out_npy.shape + if out_npy.shape != exp_oshape: + print( + "Verification output has shape %s while model produces %s" + % (str(exp_oshape), str(out_npy.shape)) + ) + print("Attempting to force model shape on verification output") + out_npy = out_npy.reshape(exp_oshape) + + res = 
np.isclose(exp_out_npy, out_npy, atol=1e-3).all() + all_res = all_res and res + res_to_str = {True: "SUCCESS", False: "FAIL"} + res_str = res_to_str[res] + if cfg.verify_save_full_context: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % ( + step_name, + b, + res_str, + ) + np.savez(verification_output_fn, **out_dict) + else: + verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % ( + step_name, + b, + res_str, + ) + np.save(verification_output_fn, out_npy) + if cfg.verify_save_rtlsim_waveforms: + vcd_path = model.get_metadata_prop("rtlsim_trace") + if vcd_path is not None and os.path.isfile(vcd_path): + new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b) + shutil.move(vcd_path, new_vcd_path) + print("Verification for %s : %s" % (step_name, res_to_str[all_res])) def prepare_for_stitched_ip_rtlsim(verify_model, cfg): - need_restitch = False - # rtlsim only supports certain impl_style for some nodes - # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): - inst = getCustomOp(fifo_layer) - if inst.get_nodeattr("impl_style") != "rtl": - inst.set_nodeattr("impl_style", "rtl") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True - # if we've made alterations to the model, need to do some re-prep - if need_restitch: - print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") - verify_model = verify_model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) - verify_model = verify_model.transform(HLSSynthIP()) - verify_model = verify_model.transform( - CreateStitchedIP( - cfg._resolve_fpga_part(), - cfg.synth_clk_period_ns, - vitis=False, + if not cfg.rtlsim_use_vivado_comps: + need_restitch = False + # switch impl_style=vivado components to rtl/hls + # StreamingFIFO must have impl_style=rtl + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + inst = getCustomOp(fifo_layer) + if inst.get_nodeattr("impl_style") != "rtl": + inst.set_nodeattr("impl_style", "rtl") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # StreamingDataWidthConverter must have impl_style=hls + for dwc_layer in verify_model.get_nodes_by_op_type("StreamingDataWidthConverter_Batch"): + inst = getCustomOp(dwc_layer) + if inst.get_nodeattr("impl_style") != "hls": + inst.set_nodeattr("impl_style", "hls") + inst.set_nodeattr("code_gen_dir_ipgen", "") + inst.set_nodeattr("ipgen_path", "") + need_restitch = True + # if we've made alterations to the model, need to do some re-prep + if need_restitch: + print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") + verify_model = verify_model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) ) - ) + verify_model = verify_model.transform(HLSSynthIP()) + verify_model = verify_model.transform( + CreateStitchedIP( + cfg._resolve_fpga_part(), + cfg.synth_clk_period_ns, + vitis=False, + ) + ) + else: + print("rtlsim_use_vivado_comps is enabled, may yield incorrect results") + # set top-level prop for stitched-ip rtlsim and 
launch verify_model.set_metadata_prop("exec_mode", "rtlsim") # TODO make configurable @@ -302,7 +356,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): # needed for convolutions -- TODO always exec? need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - model = model.transform(to_hls.InferConvInpGen()) + if cfg.force_rtl_conv_inp_gen: + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + else: + model = model.transform(to_hls.InferConvInpGen()) model = model.transform(to_hls.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq @@ -319,8 +376,7 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig parent_model = model.transform( CreateDataflowPartition( - partition_model_dir=cfg.output_dir - + "/intermediate_models/supported_op_partitions" + partition_model_dir=cfg.output_dir + "/intermediate_models/supported_op_partitions" ) ) sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition") @@ -353,14 +409,13 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi hw_attrs = [ "PE", "SIMD", + "parallel_window", "ram_style", "resType", "mem_mode", "runtime_writeable_weights", ] - extract_model_config_to_json( - model, cfg.output_dir + "/auto_folding_config.json", hw_attrs - ) + extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) return model @@ -395,9 +450,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig with open(report_dir + "/estimate_layer_cycles.json", "w") as f: json.dump(estimate_layer_cycles, f, indent=2) estimate_layer_resources = model.analysis(res_estimation) - estimate_layer_resources["total"] = aggregate_dict_keys( - estimate_layer_resources - ) + estimate_layer_resources["total"] = aggregate_dict_keys(estimate_layer_resources) with open(report_dir + "/estimate_layer_resources.json", "w") as f: json.dump(estimate_layer_resources, f, indent=2) estimate_layer_resources_complete = model.analysis(res_estimation_complete) @@ -411,8 +464,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"] estimate_network_performance["estimated_throughput_fps"] = est_fps est_latency_ns = ( - estimate_network_performance["critical_path_cycles"] - * cfg.synth_clk_period_ns + estimate_network_performance["critical_path_cycles"] * cfg.synth_clk_period_ns ) estimate_network_performance["estimated_latency_ns"] = est_latency_ns with open(report_dir + "/estimate_network_performance.json", "w") as f: @@ -420,12 +472,20 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model +def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): + """Tighten the weight and accumulator bit widths for each layer.""" + if cfg.minimize_bit_width: + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + return model + + def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation." 
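    # Reviewer note (sketch, not from the original patch): per the
    # _resolve_hls_clk_period() hunk earlier in this file, hls_clk_period_ns
    # falls back to synth_clk_period_ns when left unset, so the PrepareIP call
    # below targets one shared clock for HLS and synthesis by default.
    # Hypothetical example:
    #   cfg = DataflowBuildConfig(synth_clk_period_ns=5.0, ...)  # no HLS clock set
    #   cfg._resolve_hls_clk_period()                            # -> 5.0, i.e. 200 MHz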
-    model = model.transform(
-        PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
-    )
+    model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
     return model


@@ -446,9 +506,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
     Depending on the auto_fifo_depths setting, do one of the following:
-    * if auto_fifo_depths=True: Run the `InsertAndSetFIFODepths` transformation
-    to attempt to determine the FIFO sizes that provide full throughput. Involves
-    running stitched-IP rtlsim and may take a long time.
+    * if auto_fifo_depths=True: Run the appropriate auto-sizing transformation
+    to attempt to determine the FIFO sizes that provide full throughput.
+    May take a long time.
     * if auto_fifo_depths=False: Assume the folding config file contains FIFO
     sizes as well. Runs the `InsertFIFO` transformation, then
     `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`.
@@ -457,13 +517,48 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """

     if cfg.auto_fifo_depths:
-        model = model.transform(
-            InsertAndSetFIFODepths(
-                cfg._resolve_fpga_part(),
-                cfg._resolve_hls_clk_period(),
-                vivado_ram_style=cfg.large_fifo_mem_style,
+        if cfg.auto_fifo_strategy == "characterize":
+            model = model.transform(InsertDWC())
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            model = model.transform(HLSSynthIP())
+            model = model.transform(PrepareRTLSim())
+            model = model.transform(AnnotateCycles())
+            period = model.analysis(dataflow_performance)["max_cycles"] + 10
+            model = model.transform(DeriveCharacteristic(period))
+            model = model.transform(DeriveFIFOSizes())
+            model = model.transform(
+                InsertFIFO(
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                    max_qsrl_depth=256,
+                    create_shallow_fifos=True,
+                )
+            )
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
+            # multi-in/out streams currently not supported in our C++ verilator driver
+            model_multi_io = len(model.graph.input) > 1 or len(model.graph.output) > 1
+            force_python_sim = model_multi_io or cfg.force_python_rtlsim
+            if model_multi_io:
+                warnings.warn(
+                    "Multi-in/out streams currently not supported "
+                    + "in FINN C++ verilator driver, falling back to Python"
+                )
+            model = model.transform(
+                InsertAndSetFIFODepths(
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                    force_python_sim=force_python_sim,
+                )
+            )
+            # InsertAndSetFIFODepths internally removes any shallow FIFOs
+            # so no need to call RemoveShallowFIFOs here
+        else:
+            # a bare string assert is always truthy and would never fire
+            raise Exception("Unsupported auto_fifo_strategy: " + str(cfg.auto_fifo_strategy))
     else:
         # assume folding cfg json contains FIFO sizes too
         # insert DWCs, FIFOs and run ApplyConfig once more
@@ -475,29 +570,33 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveReadableTensorNames())
     if cfg.folding_config_file is not None:
         model = model.transform(ApplyConfig(cfg.folding_config_file))
-    # remove any shallow FIFOs
-    model = model.transform(RemoveShallowFIFOs())

     # extract the final configuration and save it as json
     hw_attrs = [
         "PE",
         "SIMD",
+        "parallel_window",
         "ram_style",
         "depth",
         "impl_style",
         "resType",
         "mem_mode",
         "runtime_writeable_weights",
+        "inFIFODepths",
+        
"outFIFODepths", ] - extract_model_config_to_json( - model, cfg.output_dir + "/final_hw_config.json", hw_attrs - ) + extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) + + # perform FIFO splitting and shallow FIFO removal only after the final config + # json file has been written. otherwise, since these transforms may add/remove + # FIFOs, we get name mismatch problems when trying to reuse the final config. + if cfg.split_large_fifos: + model = model.transform(SplitLargeFIFOs()) + model = model.transform(RemoveShallowFIFOs()) # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) - model = model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) model = model.transform(HLSSynthIP()) return model @@ -534,9 +633,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.verify_save_rtlsim_waveforms: report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) - verify_model.set_metadata_prop( - "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir) - ) + verify_model.set_metadata_prop("rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir)) verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True) os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness) return model @@ -556,20 +653,58 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi # prepare ip-stitched rtlsim rtlsim_model = deepcopy(model) rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg) - # run with single input to get latency - orig_rtlsim_trace_depth = get_rtlsim_trace_depth() - rtlsim_bs = int(cfg.rtlsim_batch_size) - assert rtlsim_bs > 0, "rtlsim batch size must be >0" - if cfg.verify_save_rtlsim_waveforms: - # set depth to 3 for layer-by-layer visibility - os.environ["RTLSIM_TRACE_DEPTH"] = "3" - rtlsim_model.set_metadata_prop( - "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs) + # multi-in/out streams currently not supported in our C++ verilator driver + model_multi_io = len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1 + force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io + if model_multi_io: + warnings.warn( + "Multi-in/out streams currently not supported " + + "in FINN C++ verilator driver, falling back to Python" ) - rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) - rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) - rtlsim_latency = rtlsim_perf_dict["cycles"] - rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + rtlsim_bs = int(cfg.rtlsim_batch_size) + orig_rtlsim_trace_depth = get_rtlsim_trace_depth() + if force_python_rtlsim: + assert rtlsim_bs > 0, "rtlsim batch size must be >0" + if cfg.verify_save_rtlsim_waveforms: + # set depth to 3 for layer-by-layer visibility + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + rtlsim_model.set_metadata_prop( + "rtlsim_trace", + "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs), + ) + rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) + # run with single input to get latency + rtlsim_latency_dict = throughput_test_rtlsim(rtlsim_model, 1) + # run with batch to get stable-state throughput + rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) + rtlsim_perf_dict["latency_cycles"] = 
rtlsim_latency_dict["cycles"]
+    else:
+        rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
+        # keep keys consistent between the Python and C++ flows
+        cycles = rtlsim_perf_dict["cycles"]
+        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        fclk_mhz = 1 / (clk_ns * 0.001)
+        runtime_s = (cycles * clk_ns) * (10**-9)
+        rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000
+        rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s
+        rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz
+        # drop any max_count keys; iterate over a snapshot of the keys, since
+        # deleting from a dict while iterating it raises RuntimeError in Python 3
+        for key in list(rtlsim_perf_dict.keys()):
+            if "max_count" in key:
+                del rtlsim_perf_dict[key]
+    # estimate stable-state throughput based on latency+throughput
+    if rtlsim_bs == 1:
+        rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict[
+            "throughput[images/s]"
+        ]
+    else:
+        total_cycles = rtlsim_perf_dict["cycles"]
+        latency_cycles = rtlsim_perf_dict["latency_cycles"]
+        stablestate_cycles = total_cycles - latency_cycles
+        clk_ns = float(model.get_metadata_prop("clk_ns"))
+        fclk_mhz = 1 / (clk_ns * 0.001)
+        runtime_s = (stablestate_cycles * clk_ns) * (10**-9)
+        rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_bs / runtime_s
+
     with open(report_dir + "/rtlsim_performance.json", "w") as f:
         json.dump(rtlsim_perf_dict, f, indent=2)
     if cfg.verify_save_rtlsim_waveforms:
@@ -595,13 +730,9 @@ def step_out_of_context_synthesis(model: ModelWrapper, cfg: DataflowBuildConfig)
     """Run out-of-context synthesis and generate reports.
     Depends on the DataflowOutputType.STITCHED_IP output product."""
     if DataflowOutputType.OOC_SYNTH in cfg.generate_outputs:
-        assert (
-            DataflowOutputType.STITCHED_IP in cfg.generate_outputs
-        ), "OOC needs stitched IP"
+        assert DataflowOutputType.STITCHED_IP in cfg.generate_outputs, "OOC needs stitched IP"
         model = model.transform(
-            SynthOutOfContext(
-                part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns
-            )
+            SynthOutOfContext(part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns)
         )
         report_dir = cfg.output_dir + "/report"
         os.makedirs(report_dir, exist_ok=True)
@@ -696,6 +827,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_create_dataflow_partition": step_create_dataflow_partition,
     "step_target_fps_parallelization": step_target_fps_parallelization,
     "step_apply_folding_config": step_apply_folding_config,
+    "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 2695113661..588e97e9e4 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -31,13 +31,10 @@
 import qonnx.analysis.topology as ta
 from qonnx.core.onnx_exec import execute_onnx as execute_onnx_base

-from finn.core.remote_exec import remote_exec
 from finn.core.rtlsim_exec import rtlsim_exec


-def execute_onnx(
-    model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
-):
+def execute_onnx(model, input_dict, return_full_exec_context=False, start_node=None, end_node=None):
     """Executes given ONNX ModelWrapper with given named inputs.
     If return_full_exec_context is False, a dict of named outputs is returned
     as indicated by the model.graph.output. 
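A quick numeric check of the stable-state throughput arithmetic added in
step_measure_rtlsim_performance above; this is a minimal sketch, and the clock
period, cycle counts and batch size below are invented purely for illustration:

    clk_ns = 5.0  # assumed 200 MHz clock
    total_cycles = 12000  # assumed rtlsim cycle count for the whole batch
    latency_cycles = 2000  # assumed single-input fill latency
    rtlsim_bs = 100  # assumed batch size
    stablestate_cycles = total_cycles - latency_cycles  # 10000
    runtime_s = (stablestate_cycles * clk_ns) * (10**-9)  # 5.0e-05 s
    stable_throughput = rtlsim_bs / runtime_s  # 2.0e+06 images/s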
@@ -51,13 +48,10 @@ def execute_onnx( # check if model has an execution mode set # if None, execute model node using the QONNX-provided execute_onnx impl - # if set to "remote_pynq" execute model on PYNQ board # if set to "rtlsim" execute model using pyverilator model_exec_mode = model.get_metadata_prop("exec_mode") if (model_exec_mode is None) or (model_exec_mode == ""): - return execute_onnx_base( - model, input_dict, return_full_exec_context, start_node, end_node - ) + return execute_onnx_base(model, input_dict, return_full_exec_context, start_node, end_node) if not model.check_all_tensor_shapes_specified(): raise Exception("Found unspecified tensor shapes, try infer_shapes") @@ -91,22 +85,17 @@ def execute_onnx( # check if model has an execution mode set # if None, execute model node by node using execute_node() - # if set to "remote_pynq" execute model on PYNQ board # if set to "rtlsim" execute model using pyverilator model_exec_mode = model.get_metadata_prop("exec_mode") if (model_exec_mode is None) or (model_exec_mode == ""): return execute_onnx_base() - elif model_exec_mode == "remote_pynq": - # use remote exec metadata built into model to execute on a remote PYNQ - remote_exec(model, execution_context) elif model_exec_mode == "rtlsim": # use stitched IP for rtlsim rtlsim_exec(model, execution_context) else: raise Exception( - """Metadata property "exec_mode" is set to an unknown value. - Can be left unset or has to be set to "remote_pynq" for remote execution - on PYNQ board or "rtlsim" for execution using pyverilator!""" + """Metadata property "exec_mode" is set to an unknown value. Can be left + unset or has to be set to "rtlsim" for execution using pyverilator!""" ) if return_full_exec_context: diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py deleted file mode 100644 index f487b48f86..0000000000 --- a/src/finn/core/remote_exec.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2020 Xilinx, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of Xilinx nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import numpy as np -import os -import subprocess -import warnings - - -def remote_exec(model, execution_context): - """Executes the given model remotely on the pynq board. The metadata properties - related to the pynq board have to be set. The execution context contains the - input values.""" - # TODO fix for multi input-output - pynq_ip = model.get_metadata_prop("pynq_ip") - pynq_port = int(model.get_metadata_prop("pynq_port")) - pynq_username = model.get_metadata_prop("pynq_username") - pynq_password = model.get_metadata_prop("pynq_password") - pynq_target_dir = model.get_metadata_prop("pynq_target_dir") - deployment_dir = model.get_metadata_prop("pynq_deploy_dir") - platform = model.get_metadata_prop("platform") - assert platform in ["alveo", "zynq-iodma"] - bitfile = model.get_metadata_prop("bitfile") - bitfile = os.path.basename(bitfile) - if pynq_password == "": - if "zynq" in platform: - raise Exception("PYNQ board remote exec needs password for sudo") - else: - local_prefix = "" # assume we are using an ssh key - warnings.warn("Empty password, make sure you've set up an ssh key") - else: - local_prefix = "sshpass -p %s " % pynq_password - - if platform == "alveo": - # Alveo can run without sudo - remote_prefix = "" - elif "zynq" in platform: - # PYNQ Zynq boards need to execute with sudo - remote_prefix = "echo %s | sudo -S " % pynq_password - - inp = execution_context[model.graph.input[0].name] - # make copy of array before saving it - inp = inp.copy() - batchsize = inp.shape[0] - np.save(os.path.join(deployment_dir, "input.npy"), inp) - # extracting last folder of absolute path (deployment_dir) - deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) - # copy input to PYNQ board - cmd = local_prefix + "scp -P{} -r {}/input.npy {}@{}:{}/{}".format( - pynq_port, - deployment_dir, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_scp_in = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_scp_in.communicate() - - # use platform attribute for correct remote execution - if platform == "alveo": - remote_cmd = "bash -ic 'bash alveo_run.sh execute %d' \"" % batchsize - else: - remote_cmd = ( - "python3.6 driver.py --exec_mode=execute --batchsize={} " - "--bitfile={} --inputfile=input.npy --outputfile=output.npy " - '--platform={} "' - ).format(batchsize, bitfile, platform) - cmd = ( - local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd - ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) - bash_command = ["/bin/bash", "-c", cmd] - process_exec_accel = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_exec_accel.communicate() - # remove stale output file from local dir, if any - try: - os.remove("{}/output.npy".format(deployment_dir)) - except FileNotFoundError: - pass - # copy generated output to local - cmd = local_prefix + "scp -P{} {}@{}:{}/{}/output.npy {}".format( - pynq_port, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - deployment_dir, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_scp_out = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_scp_out.communicate() - outp = np.load("{}/output.npy".format(deployment_dir)) - execution_context[model.graph.output[0].name] = outp diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index 3533fd1339..08633be33b 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py 
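With the remote (PYNQ-over-SSH) flow removed above, throughput_test_rtlsim is
the remaining entry point in this module. A minimal usage sketch; the `model`
handle and batch size are assumptions, and the result keys shown are the ones
read by step_measure_rtlsim_performance earlier in this patch:

    from finn.core.throughput_test import throughput_test_rtlsim

    # `model` is assumed to be an IP-stitched ModelWrapper prepared for rtlsim
    res = throughput_test_rtlsim(model, batchsize=100)
    print(res["cycles"], res["throughput[images/s]"])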
@@ -28,90 +28,11 @@ import numpy as np import os -import subprocess -import warnings from qonnx.util.basic import gen_finn_dt_tensor from finn.core.rtlsim_exec import rtlsim_exec -def throughput_test_remote(model, batchsize=1000, timeout=None): - """Runs the throughput test for the given model remotely on the pynq board. - The metadata properties related to the pynq board have to be set. - Additionally a timeout for the SSH communication can be set. - Returns a dictionary with results of the throughput test. Returns None - if the test fails.""" - - pynq_ip = model.get_metadata_prop("pynq_ip") - pynq_port = int(model.get_metadata_prop("pynq_port")) - pynq_username = model.get_metadata_prop("pynq_username") - pynq_password = model.get_metadata_prop("pynq_password") - pynq_target_dir = model.get_metadata_prop("pynq_target_dir") - deployment_dir = model.get_metadata_prop("pynq_deploy_dir") - # extracting last folder of absolute path (deployment_dir) - deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) - platform = model.get_metadata_prop("platform") - assert platform in ["alveo", "zynq-iodma"] - bitfile = model.get_metadata_prop("bitfile") - bitfile = os.path.basename(bitfile) - if pynq_password == "": - if "zynq" in platform: - raise Exception("PYNQ board remote exec needs password for sudo") - else: - local_prefix = "" # assume we are using an ssh key - warnings.warn("Empty password, make sure you've set up an ssh key") - else: - local_prefix = "sshpass -p %s " % pynq_password - - if platform == "alveo": - # Alveo can run without sudo but needs correct environment - remote_prefix = "conda activate finn-pynq-alveo; " - elif "zynq" in platform: - # PYNQ Zynq boards need to execute with sudo - remote_prefix = "echo %s | sudo -S " % pynq_password - - # use platform attribute for correct remote execution - if platform == "alveo": - remote_cmd = "bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize - else: - remote_cmd = ( - "python3.6 driver.py --exec_mode=throughput_test --batchsize={} " - "--bitfile={} --inputfile=input.npy --outputfile=output.npy " - '--platform={} "' - ).format(batchsize, bitfile, platform) - cmd = ( - local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd - ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) - bash_command = ["/bin/bash", "-c", cmd] - process_throughput_test = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_throughput_test.communicate(timeout=timeout) - - # remove any pre-existing metrics file - try: - os.remove("{}/nw_metrics.txt".format(deployment_dir)) - except FileNotFoundError: - pass - - cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format( - pynq_port, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - deployment_dir, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate(timeout=timeout) - - try: - with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: - res = eval(file.read()) - return res - except FileNotFoundError: - return None - - def throughput_test_rtlsim(model, batchsize=100): """Runs a throughput test for the given IP-stitched model. 
When combined with tracing, useful to determine bottlenecks and required FIFO sizes.""" diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 2c7c86c64e..56d4230a3a 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -36,9 +36,14 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( ConvolutionInputGenerator1D, ) +from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch +from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch @@ -67,6 +72,7 @@ custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO @@ -85,3 +91,5 @@ custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum +custom_op["StreamingEltwise"] = StreamingEltwise +custom_op["FMPadding_rtl"] = FMPadding_rtl diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index 13a4c5892c..51de1590ec 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -38,22 +38,25 @@ class AddStreams_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib AddStreams_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) return my_attrs def get_normal_input_shape(self, ind=0): @@ -70,10 +73,10 @@ def get_folded_input_shape(self, ind=0): ishape = tuple(vecs + 
[ich // pe, pe])
         return ishape

-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()

-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()

     def make_shape_compatible_op(self, model):
@@ -118,17 +121,15 @@ def verify_node(self):
             self.get_nodeattr("inputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
-            info_messages.append(
-                """The required LabelSelect_Batch attributes do not exist."""
-            )
+            info_messages.append("""The required LabelSelect_Batch attributes do not exist.""")

         return info_messages

-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]

-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # we need to set output datatype to the next larger int or uint
         # enhancement: consider specifying w/ explicit outputDataType attribute
@@ -139,14 +140,14 @@ def get_output_datatype(self):
         else:
             return DataType.get_smallest_possible(2 * idt.max())

-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width

-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -181,9 +182,7 @@ def execute_node(self, context, graph):

         inp = context[node.input[0]]
         assert str(inp.dtype) == "float32", "Input datatype is not float32"
-        assert (
-            inp.shape == exp_ishape
-        ), """Input0 shape doesn't match expected shape ."""
+        assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape ."""
         export_idt = self.get_input_datatype()
         # reshape input into folded form
         inp = inp.reshape(folded_ishape)
@@ -194,9 +193,7 @@ def execute_node(self, context, graph):
         # exact same thing for input1
         inp = context[node.input[1]]
         assert str(inp.dtype) == "float32", "Input datatype is not float32"
-        assert (
-            inp.shape == exp_ishape
-        ), """Input1 shape doesn't match expected shape ."""
+        assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape ."""
         export_idt = self.get_input_datatype()
         # reshape input into folded form
         inp = inp.reshape(folded_ishape)
@@ -265,37 +262,60 @@ def read_npy_data(self):
         self.code_gen_dict["$READNPYDATA$"] = []
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
         )
         npy_in = "%s/input_1.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
         )

     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width())
+            'hls::stream<ap_uint<{}>> in1_{} ("in1_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
         )

     def docompute(self):
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, {}, {}, {}, {}> (in0, in1, out, 1);""".format(
+            """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 1);""".format(
                 node.op_type,
                 self.get_nodeattr("PE"),
                 self.get_input_datatype().get_hls_datatype_str(),
                 self.get_input_datatype().get_hls_datatype_str(),
                 self.get_output_datatype().get_hls_datatype_str(),
                 self.get_number_output_values(),
+                self.hls_sname(),
+                self.hls_sname(),
+                self.hls_sname(),
             )
         ]

@@ -312,12 +332,13 @@ def dataoutstrm(self):
         oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")

         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
             % (
                 packed_hls_type,
                 elem_hls_type,
                 elem_bits,
                 npy_type,
+                self.hls_sname(),
                 oshape_cpp_str,
                 npy_out,
             )
@@ -328,28 +349,29 @@ def save_as_npy(self):

     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1,
-                hls::stream<ap_uint<{}>> &out)""".format(
+            """void {}(hls::stream<ap_uint<{}>> &in0_{}, hls::stream<ap_uint<{}>> &in1_{},
+                hls::stream<ap_uint<{}>> &out_{})""".format(
                 self.onnx_node.name,
                 self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(),
+                self.hls_sname(),
                 self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(),
+                self.hls_sname(),
                 self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+                self.hls_sname(),
             )
         ]

     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
         ]
         self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+            "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname()
         )
         self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
         )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")

     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
@@ -357,3 +379,14 @@ def get_verilog_top_module_intf_names(self):
         swidth = self.get_instream_width_padded()
         intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+                "in1": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 3ed76db298..5e0063ac33 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
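The stream-renaming pattern above repeats in every custom op touched below
(channelwise, checksum, concat, convolution input generators): the generated
stream variable now carries the interface suffix itself, so the axis pragma no
longer needs a separate name= clause. Illustrative generated C++, assuming
hls_sname() yields the Vitis HLS default suffix "V" and a stream width W:

    # before: plain variable, pragma renames the RTL port
    #   hls::stream<ap_uint<W>> in0 ("in0");
    #   #pragma HLS INTERFACE axis port=in0 name=in0_V
    # after: suffix baked into the variable, no name= clause needed
    #   hls::stream<ap_uint<W>> in0_V ("in0_V");
    #   #pragma HLS INTERFACE axis port=in0_V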
@@ -39,8 +39,6 @@ rtlsim_output_to_npy, ) -from . import templates - # ONNX i/o tensor shape assumptions for channelwise ops: # input 0 is the input tensor, shape (..., NumChannels) # input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) @@ -85,9 +83,8 @@ class ChannelwiseOp_Batch(HLSCustomOp): including Add, Mul and multi-thresholding. """ - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -102,9 +99,6 @@ def get_nodeattr_types(self): "inputDataType": ("s", True, ""), "paramDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -184,9 +178,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required Threshold_Batch attributes do not exist.""" - ) + info_messages.append("""The required Threshold_Batch attributes do not exist.""") return info_messages @@ -221,23 +213,23 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -245,17 +237,17 @@ def get_folded_input_shape(self): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() @@ -306,9 +298,7 @@ def get_hls_compatible_parameter_tensor(self, orig_param_vector): assert (orig_param_vector.astype(np.int32) == orig_param_vector).all() ret = orig_param_vector - assert ( - ret.shape[0] == chn - ), "Cardinality of parameter vector is not as expected (chn)" + assert ret.shape[0] == chn, "Cardinality of parameter vector is not as expected (chn)" # distribute rows between PEs ret = ret.reshape(tmem, pe).transpose() @@ -330,9 +320,7 @@ def generate_params(self, model, path): parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters) pdt = DataType[self.get_nodeattr("paramDataType")] - parameters_hls_code 
= numpy_to_hls_code( - parameter_tensor, pdt, "parameters", False, True - ) + parameters_hls_code = numpy_to_hls_code(parameter_tensor, pdt, "parameters", False, True) # get input data type export_idt = self.get_input_datatype() if self.get_input_datatype() == DataType["BIPOLAR"]: @@ -436,9 +424,7 @@ def execute_node(self, context, graph): elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) output = self.rtlsim(sim, inp) @@ -447,9 +433,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -492,17 +476,28 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -518,10 +513,12 @@ def docompute(self): raise Exception("""Unexpeted input shape""") self.code_gen_dict["$DOCOMPUTE$"] = [ """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> - (in0, out, threshs, numReps);""".format( + (in0_{}, out_{}, threshs, numReps);""".format( spatial_dim, tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), ) ] @@ -542,12 +539,13 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) @@ -558,34 +556,31 @@ def save_as_npy(self): def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL] # partition for parallel access along PE and N_PARAMS_PER_CHANNEL # dimensions (dims 1 and 3) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1") ) # self.code_gen_dict["$PRAGMAS$"].append( # ( @@ -603,17 +598,11 @@ def pragmas(self): if pe < ich: if ram_style == "distributed": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.parameters " - "core=ROM_2P_LUTRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_LUTRAM") ) elif ram_style == "block": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.parameters " - "core=ROM_2P_BRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_BRAM") ) else: raise Exception( diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py index bde285eb0d..6121c5d97a 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/checksum.py @@ -38,8 +38,8 @@ class CheckSum(HLSCustomOp): """Class that corresponds to custom_hls checksum function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -77,31 +77,31 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # here same as input data type return DataType[self.get_nodeattr("inputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("inputDataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): return self.get_instream_width() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): # derive normal shape from folded shape # checksum nodes are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -127,7 +127,7 @@ def get_normal_input_shape(self): def get_ap_int_max_w(self): return max(super().get_ap_int_max_w(), 32) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() @@ -183,9 +183,7 @@ def execute_node(self, context, graph): np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp 
= npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) io_dict = { @@ -199,9 +197,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -241,17 +237,28 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<32> chk;") # set drain = false for cppsim @@ -259,7 +266,8 @@ def strm_decl(self): def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ - """checksum(in0, out, chk, drain);""" + """checksum(in0_%s, out_%s, chk, drain);""" + % (self.hls_sname(), self.hls_sname()) ] def dataoutstrm(self): @@ -279,19 +287,19 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ), "std::vector checksum(1);", "checksum[0] = chk;", - 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' - % code_gen_dir, + 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' % code_gen_dir, ] def save_as_npy(self): @@ -299,18 +307,18 @@ def save_as_npy(self): def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """using T = ap_uint;\n void {}(hls::stream &in0, - hls::stream &out, ap_uint<32> &chk, ap_uint<1> &drain)""".format( - self.onnx_node.name + """using T = ap_uint;\n void {}(hls::stream &in0_{}, + hls::stream &out_{}, ap_uint<32> &chk, ap_uint<1> &drain)""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS interface axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS interface axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS interface axis port=out name=out_" + self.hls_sname() + "#pragma HLS interface axis port=out_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS interface s_axilite port=chk bundle=checksum" @@ -318,13 +326,9 @@ def pragmas(self): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS interface s_axilite 
port=drain bundle=checksum" ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS interface ap_ctrl_none port=return" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS interface ap_ctrl_none port=return") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow") - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS dataflow disable_start_propagation" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow disable_start_propagation") def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 5fcf9cf96c..8c24dadbeb 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -39,8 +39,8 @@ class StreamingConcat(HLSCustomOp): """Streaming concatenation node with dynamically generated HLS. Only supports concatenating along the last axis.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -74,12 +74,12 @@ def get_normal_input_shape(self, ind=0): def get_folded_input_shape(self, ind=0): return self.get_normal_input_shape(ind) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): total_elems = self.get_total_elems() vecs = list(self.get_nodeattr("numInputVectors")) return tuple(vecs + [total_elems]) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_normal_output_shape() def make_shape_compatible_op(self, model): @@ -106,7 +106,7 @@ def get_input_datatype(self, ind=0): # input dt identical for all inputs return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): return self.get_input_datatype() def get_instream_width(self, ind=0): @@ -115,7 +115,7 @@ def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() return elems * ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() total_elems = self.get_total_elems() out_width = total_elems * obits @@ -134,7 +134,7 @@ def generate_params(self, model, path): idt = self.get_input_datatype() total_elems = self.get_total_elems() total_bw = idt.bitwidth() * total_elems - for (i, elems) in enumerate(elems_per_stream): + for i, elems in enumerate(elems_per_stream): bw = idt.bitwidth() * elems inp_stream = "hls::stream > &in%d" % (bw, i) inp_streams.append(inp_stream) @@ -278,8 +278,16 @@ def read_npy_data(self): packed_hls_type = "ap_uint<%d>" % packed_bits npy_in = "%s/input_%d.npy" % (code_gen_dir, i) self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in%d);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, i) + 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + i, + self.hls_sname(), + ) ) def strm_decl(self): @@ -288,21 +296,27 @@ def strm_decl(self): for i in range(n_inputs): packed_bits = self.get_instream_width(i) packed_hls_type = "ap_uint<%d>" % packed_bits - stream_name = "in%d" % i + stream_name = "in%d_%s" % (i, self.hls_sname()) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<%s> %s ("%s");' - % (packed_hls_type, stream_name, stream_name) + 'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, 
stream_name) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [] n_inputs = self.get_n_inputs() - in_stream_names = ["in%d" % x for x in range(n_inputs)] - in_stream_names = ",".join(in_stream_names) - comp_call = "StreamingConcat(%s, out, NumReps);" % (in_stream_names) + in_streams = [] + for i in range(n_inputs): + in_streams.append("in%d_%s" % (i, self.hls_sname())) + in_stream_names = ",".join(in_streams) + comp_call = "StreamingConcat(%s, out_%s, NumReps);" % ( + in_stream_names, + self.hls_sname(), + ) self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] def dataoutstrm(self): @@ -318,12 +332,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -337,10 +352,13 @@ def blackboxfunction(self): in_streams = [] for i in range(n_inputs): iwidth = self.get_instream_width(i) - in_streams.append("hls::stream> &in%d" % (iwidth, i)) + in_streams.append("hls::stream> &in%d_%s" % (iwidth, i, self.hls_sname())) in_streams = ",".join(in_streams) total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() - out_stream = "hls::stream> &out" % (total_width) + out_stream = "hls::stream> &out_%s" % ( + total_width, + self.hls_sname(), + ) blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] @@ -348,17 +366,12 @@ def pragmas(self): n_inputs = self.get_n_inputs() pragmas = [] for i in range(n_inputs): - pragmas.append( - "#pragma HLS INTERFACE axis port=in%d name=in%d_%s" - % (i, i, self.hls_sname()) - ) + pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname())) self.code_gen_dict["$PRAGMAS$"] = pragmas self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def get_instream_width_padded(self, ind=0): in_width = self.get_instream_width(ind) @@ -370,7 +383,5 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() intf_names["s_axis"] = [] for i in range(n_inputs): - intf_names["s_axis"].append( - ("in%d_%s" % (i, sname), self.get_instream_width_padded(i)) - ) + intf_names["s_axis"].append(("in%d_%s" % (i, sname), self.get_instream_width_padded(i))) return intf_names diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 251a9882c5..33c542d79d 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -54,8 +54,8 @@ class ConvolutionInputGenerator(HLSCustomOp): attributes (e.g. 
depthwise or not, whether k % stride is 0) a different variant will be picked for the actual HLS implementation.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -99,13 +99,13 @@ def get_nodeattr(self, name): assert ret[0] == ret[1] == 1, "Only dilation=1 supported" return ret - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -114,7 +114,7 @@ def get_folded_input_shape(self): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -126,7 +126,7 @@ def get_normal_output_shape(self): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -158,15 +158,15 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function""" ibits = self.get_input_datatype().bitwidth() @@ -176,7 +176,7 @@ def get_instream_width(self): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns stream width, input and output stream width are equal for the sliding window function, so the function to determine the input stream width can be reused.""" @@ -202,9 +202,7 @@ def get_exp_cycles(self): cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) max_cycles = max(cycles_write_block, cycles_read_block) - exp_cycles = ( - ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles - ) + exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles return int(exp_cycles) @@ -401,17 +399,28 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 
("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -436,15 +445,15 @@ def docompute(self): if self.get_nodeattr("depthwise") == 1: self.code_gen_dict["$DOCOMPUTE$"] = [ """{}_dws (in0, out, numReps, {});""".format( - hls_call, hls_ram_style + OFMDim1, SIMD1, Stride1> (in0_{}, out_{}, numReps, {});""".format( + hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] else: self.code_gen_dict["$DOCOMPUTE$"] = [ """{} (in0, out, numReps, {});""".format( - hls_call, hls_ram_style + OFMDim1, SIMD1, Stride1> (in0_{}, out_{}, numReps, {});""".format( + hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] @@ -464,12 +473,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -480,19 +490,17 @@ def save_as_npy(self): def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( - self.onnx_node.name + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py index aba74baecc..046e8e096d 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py @@ -59,8 +59,8 @@ class ConvolutionInputGenerator1D(HLSCustomOp): attributes (e.g. 
depthwise or not, whether dilation is 0) a different variant will be picked for the actual HLS implementation.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -91,13 +91,13 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") @@ -106,7 +106,7 @@ def get_folded_input_shape(self): folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -118,7 +118,7 @@ def get_normal_output_shape(self): oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k_h, k_w = self.get_nodeattr("ConvKernelDim") ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -153,15 +153,15 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -169,7 +169,7 @@ def get_instream_width(self): in_width = simd * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.use_parallel_window_output(): # feed all window pixels in parallel k_h, k_w = self.get_nodeattr("ConvKernelDim") @@ -245,13 +245,7 @@ def use_parallel_window_output(self): no_dilation = dilation_h == 1 and dilation_w == 1 supported_ram_style = ram_style in ["auto", "distributed"] if self.get_nodeattr("parallel_window") == 1: - if ( - fully_unfolded - and non_dws - and no_stride - and no_dilation - and supported_ram_style - ): + if fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style: return True else: warnings.warn( @@ -289,10 +283,7 @@ def get_exp_cycles(self): "ConvolutionInputGenerator_1D_dws_stride", ]: exp_cycles = ( - 1 - + ofm_dim_w * k_w * ifm_ch / simd - + (ifm_ch / simd) * (k_w - 1) - - (k_w - 1) + 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) ) elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": cycles_read_block = ifm_dim_w * ifm_ch / simd @@ -337,9 +328,7 @@ def bram_estimation(self): ram_width = 2 else: ram_width = 1 - width_mul = math.ceil( - simd * self.get_input_datatype().bitwidth() / ram_width - ) + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) depth_mul = math.ceil(ram_depth / 18432) return 
width_mul * depth_mul else: @@ -358,25 +347,17 @@ def lut_estimation(self): ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": - ram_luts = math.ceil( - simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64 - ) + ram_luts = math.ceil(simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64) elif ram_style == "distributed": if swu_variant == "ConvolutionInputGenerator_1D": - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64 - ) + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64) elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64 - ) + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64) elif swu_variant in [ "ConvolutionInputGenerator_1D_dws", "ConvolutionInputGenerator_1D_dws_stride", ]: - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64 - ) + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64) else: ram_luts = 0 return 300 + ram_luts @@ -601,17 +582,28 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -630,40 +622,40 @@ def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] if swu_variant == "ConvolutionInputGenerator_1D": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] if swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] if swu_variant == "ConvolutionInputGenerator_1D_dws_stride": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), 
self.hls_sname(), hls_ram_style ) ] @@ -690,12 +682,13 @@ def dataoutstrm(self): multi_pixel_out = 1 self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", true, 1, %d);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, multi_pixel_out, @@ -708,27 +701,25 @@ def save_as_npy(self): def blackboxfunction(self): if self.use_parallel_window_output(): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, + """void {}(hls::stream> &in0_{}, hls::stream> - &out)""".format( - self.onnx_node.name + &out_{})""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] else: self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( - self.onnx_node.name + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py new file mode 100755 index 0000000000..a55cdcc0be --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -0,0 +1,1219 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
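The RTL sliding-window generator added below picks one of two implementation styles from the ratio of output to input stream width, as its header comment explains. A minimal sketch of that selection rule, with hypothetical folding values (the helper name and numbers are illustrative, not part of this file):

```python
# Sketch: how out_width vs. in_width selects the SWG implementation style.
def sketch_impl_style(simd: int, ibits: int, k_h: int, k_w: int, parallel_window: bool) -> str:
    in_width = simd * ibits  # bits entering the buffer per cycle
    mmv_out = k_h * k_w if parallel_window else 1  # window pixels leaving per cycle
    out_width = in_width * mmv_out
    return "parallel" if out_width > in_width else "default"

# SIMD=4, 8-bit input, 3x3 kernel:
assert sketch_impl_style(4, 8, 3, 3, parallel_window=False) == "default"  # 32b in, 32b out
assert sketch_impl_style(4, 8, 3, 3, parallel_window=True) == "parallel"  # 32b in, 288b out
```

Note that `parallel_window` implies full SIMD unfolding, which is why the code below asserts SIMD == IFMChannels whenever the parallel style is chosen.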
+ +import math +import numpy as np +import os +import shutil +from qonnx.core.datatype import DataType +from qonnx.custom_op.general import im2col +from qonnx.custom_op.general.im2col import compute_conv_output_dim + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +# RTL Convolution Input Generator / Sliding Window Generator (SWG) +# Matches and extends the functionality of all ConvolutionInputGenerator_* functions +# in finn-hlslib by generating HDL code for two different implementation styles: +# - Addressable cyclic buffer: to be used when out_width <= in_width +# - Parallel registers + line buffers: to be used when out_width > in_width +# Supports non-square, 1D, strided, dilated, and depthwise convolutions. +# Note: the actual data layout produced is different for depthwise and non-depthwise: +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) + +# NOTE: "Parallel" implementation style not yet implemented in this version! + + +class ConvolutionInputGenerator_rtl(HLSCustomOp): + """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Generates an RTL ConvolutionInputGenerator + implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] + "IFMChannels": ("i", True, 0), + "IFMDim": ("ints", True, []), # [H, W] = [Y, X] + "OFMDim": ("ints", True, []), # [H, W] = [Y, X] + "SIMD": ("i", True, 0), + # additional parallelization parameter - not yet implemented + "M": ("i", False, 1), + # Enable parallel window output (requires full SIMD unfolding) + "parallel_window": ("i", False, 0, {0, 1}), + "Stride": ("ints", True, []), # [H, W] = [Y, X] + "Dilation": ("ints", True, []), # [H, W] = [Y, X] + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0, {0, 1}), + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime (requires parallel_window = 0) + "dynamic_mode": ("i", False, 0, {0, 1}), + # FPGA resource type for ConvolutionInputGenerator input buffer + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + wf = int(ifm_ch / simd) + folded_ishape = (1, ifm_dim_h, ifm_dim_w, 
wf, simd) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + if self.get_nodeattr("parallel_window"): + wf = int((ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + return folded_oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + in_width = simd * ibits + return in_width + + def get_outstream_width(self, ind=0): + if self.get_nodeattr("parallel_window"): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() + + def get_number_input_values(self): + """Function to get the number of expected input values.""" + folded_ishape = self.get_folded_input_shape() + num_input_elems = np.prod(folded_ishape[:-1]) + return num_input_elems + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + num_output_elems = np.prod(folded_oshape[:-1]) + return num_output_elems + + def get_1d_conv_attrs_normalized(self): + """Returns normalized spatial attributes, where H=1 for the 1D case.""" + # normalize FM dimensions so that: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. 
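The normalization described above is easy to exercise in isolation; a standalone sketch that mirrors the flipping logic in the method body (values are hypothetical):

```python
# Sketch of the [H, W] normalization: any (D, 1) layout is flipped to (1, D),
# so the dummy dimension always ends up in H.
def normalize_1d_attrs(ifm_dim, ofm_dim, k, stride, dilation):
    if ifm_dim[1] == 1:  # (D, 1) layout -> reverse every spatial attribute
        ifm_dim, ofm_dim = ifm_dim[::-1], ofm_dim[::-1]
        k, stride, dilation = k[::-1], stride[::-1], dilation[::-1]
    return ifm_dim, ofm_dim, k, stride, dilation

# (D, 1) input normalizes to the same canonical form as (1, D):
assert normalize_1d_attrs([32, 1], [30, 1], [3, 1], [1, 1], [1, 1]) == (
    [1, 32], [1, 30], [1, 3], [1, 1], [1, 1],
)
```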
+ ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + + def get_buffer_depth(self): + """Returns total depth of the internal buffer, depending on + implementation style.""" + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + impl_style = self.select_impl_style() + if impl_style == "default": + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, + ) + ) + elif impl_style == "parallel": + buffer_depth = buffer_min_size + 1 + return buffer_depth + + def get_exp_cycles(self): + impl_style = self.select_impl_style() + + if impl_style == "parallel": + exp_cycles = self.get_number_input_values() + 2 + elif impl_style == "default": + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + channel_factor = int(ifm_ch / simd) + if ifm_dim_h == 1 or ifm_dim_w == 1: + # 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + if depthwise: + exp_cycles = ( + +ofm_dim_w * k_w * channel_factor + + channel_factor * (k_w - 1) * (stride_w - 1) + - (k_w - 1) + + 2 + ) + else: + exp_cycles = ofm_dim_w * k_w * channel_factor + 2 + else: + # 2D case + buffer_min_size = ( + (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor + cycles_read_block = stride_w * ifm_dim_w * channel_factor + max_cycles = max(cycles_write_block, cycles_read_block) + if depthwise: + max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) + exp_cycles = buffer_min_size + ofm_dim_h * max_cycles + if depthwise: + exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor + + return int(exp_cycles) + + def bram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + impl_style = self.select_impl_style() + [k_h, k_w] = self.get_nodeattr("ConvKernelDim") + 
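As a numeric cross-check of the `buffer_min_size` expression in `get_buffer_depth` above, with hypothetical values:

```python
# buffer_min_size = ((k_h-1)*dilation_h*w + (k_w-1)*dilation_w + 1) * channel_factor
k_h, k_w = 3, 3
dilation_h, dilation_w = 1, 1
w = 32                    # IFM width
channel_factor = 16 // 4  # IFMChannels / SIMD

buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor
# two full image rows (2*32) plus the tail of the third window row (2+1),
# times 4 SIMD words per pixel
assert buffer_min_size == (64 + 2 + 1) * 4 == 268
```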
[ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim") + [dilation_h, dilation_w] = self.get_nodeattr("Dilation") + + if ram_style == "block" or ram_style == "auto": + buffer_width = simd * self.get_input_datatype().bitwidth() + if impl_style == "default": + buffer_depth = self.get_buffer_depth() + buffer_count = 1 + elif impl_style == "parallel": + if ifm_dim_h == 1 or ifm_dim_w == 1: + return 0 # 1D case (no line buffers needed) + kernel_width = (k_w - 1) * dilation_w + 1 + buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1) + buffer_count = k_h - 1 + + # NOTE: Actual BRAM usage might be lower in some cases + # due to imperfect modeling of Vivado behavior + if buffer_depth <= 512: + ram_width = 36 + elif buffer_depth <= 1024: + ram_width = 18 + elif buffer_depth <= 2048: + ram_width = 9 + elif buffer_depth <= 4096: + ram_width = 4 + elif buffer_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + + ram_cascade_depth = math.ceil(buffer_depth / 16384) + ram_cascade_width = math.ceil(buffer_width / ram_width) + cascade_savings = 0 + if buffer_depth > 16384: + remainder_depth = buffer_depth % 16384 + if remainder_depth <= 512: + remainder_width = 36 + elif remainder_depth <= 1024: + remainder_width = 18 + elif remainder_depth <= 2048: + remainder_width = 9 + elif remainder_depth <= 4096: + remainder_width = 4 + elif remainder_depth <= 8192: + remainder_width = 2 + else: + remainder_width = 1 + + remainder_cascade_width = math.ceil(buffer_width / remainder_width) + cascade_savings = ram_cascade_width - remainder_cascade_width + + return int((ram_cascade_depth * ram_cascade_width - cascade_savings) * buffer_count) + else: + return 0 + + def lut_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + buffer_width = simd * self.get_input_datatype().bitwidth() + buffer_depth = self.get_buffer_depth() + if ram_style == "distributed": + ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + simd = self.get_nodeattr("SIMD") + ram_style = self.get_nodeattr("ram_style") + impl_style = self.select_impl_style() + [k_h, k_w] = self.get_nodeattr("ConvKernelDim") + [ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim") + [dilation_h, dilation_w] = self.get_nodeattr("Dilation") + + if ram_style == "ultra": + buffer_width = simd * self.get_input_datatype().bitwidth() + if impl_style == "default": + buffer_depth = self.get_buffer_depth() + buffer_count = 1 + elif impl_style == "parallel": + if ifm_dim_h == 1 or ifm_dim_w == 1: + return 0 # 1D case (no line buffers needed) + kernel_width = (k_w - 1) * dilation_w + 1 + buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1) + buffer_count = k_h - 1 + + ram_depth = 4096 + ram_width = 72 + ram_cascade_depth = math.ceil(buffer_depth / ram_depth) + ram_cascade_width = math.ceil(buffer_width / ram_width) + return int(ram_cascade_depth * ram_cascade_width * buffer_count) + else: + return 0 + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception("cppsim not possible for RTL SWG, please set exec_mode to rtlsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. 
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( + abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # default controller loop structure: # iterations (counters) map directly + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = k_h + loop_kw_iterations = k_w + loop_simd_iterations = channel_factor + + if depthwise and channel_factor > 1: + # re-arrange existing controller loop structure for depthwise convolutions + loop_kh_iterations = channel_factor + loop_kw_iterations = k_h + loop_simd_iterations = k_w + addr_incr_end_simd_ = addr_incr_end_simd + addr_incr_end_simd = addr_incr_end_window_elem + addr_incr_end_window_elem = addr_incr_end_window_row + addr_incr_end_window_row = addr_incr_end_simd_ + elem_per_window = k_h * k_w + + tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor + tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["1"] + else: + # depthwise output format is equivalent to non-depthwise if SIMD=C + elem_per_window = k_h * k_w * channel_factor + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = buffer_min_size - 1 + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + + # support SIMD = IFMChannels and k_w = 1 cases + # for k = [k_h, k_w] = [1, k_w], no adjustment is needed + # for k = [k_h, k_w] = [1, 1], do not use this impl. 
style (mmv_out=K=1) + # innermost loop is executed at least once -> adjust if needed + if loop_simd_iterations == 1: + # skip innermost SIMD loop completely + if loop_kw_iterations == 1: + # skip innermost KW loop completely + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"] + loop_kw_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"] + loop_simd_iterations -= 1 # -1 because state is initial state + + cntr_bitwidth = math.ceil( + math.log2( + max( + loop_h_iterations - 2 + 1, + loop_w_iterations - 2 + 1, + loop_kh_iterations - 2 + 1, + loop_kw_iterations - 2 + 1, + loop_simd_iterations - 2 + 1, + ) + ) + ) + code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)] + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)] + code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)] + code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)] + code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)] + code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)] + code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)] + code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)] + code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)] + + code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)] + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + return template_path, code_gen_dict + + def prepare_codegen_parallel(self): + """Fills code generation dict for the parallel implementation style by computing + the loop controller configuration and partitioning the fixed buffer into + shift-registers (for parallel read access) and line buffers (for efficient + LUTRAM/BRAM/URAM implementation).""" + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_parallel.sv" + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = M * 1 + mmv_out = M * k_h * k_w + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming 
it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # re-use default controller loop structure + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w # now the innermost loop + loop_kh_iterations = 1 + loop_kw_iterations = 1 + loop_simd_iterations = 1 + + if loop_w_iterations == 1: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_H"] + loop_h_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_W"] + loop_w_iterations -= 1 # -1 because state is initial state + + # set head and tail address increment values + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + tail_incr_w = addr_incr_end_window + buffer_min_size - 1 + tail_incr_h = addr_incr_end_row + buffer_min_size - 1 + tail_incr_last_window = stride_w + + addr_incr_end_simd = 1 + addr_incr_end_window_elem = 1 + addr_incr_end_window_row = 1 + addr_incr_end_window = tail_incr_w + addr_incr_end_row = tail_incr_h + + # add init value for CURRENT_ELEM counter = last elem of first window + code_gen_dict["$FIRST_WRITE_ELEM$"] = [str(buffer_min_size - 1)] + + cntr_bitwidth = math.ceil( + math.log2( + max( + loop_h_iterations - 2 + 1, + loop_w_iterations - 2 + 1, + loop_kh_iterations - 2 + 1, + loop_kw_iterations - 2 + 1, + loop_simd_iterations - 2 + 1, + ) + ) + ) + code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)] + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)] + code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + abs(tail_incr_w) + 1, + abs(tail_incr_h) + 1, + abs(tail_incr_last_window) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)] + code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)] + code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)] + code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)] + code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)] + code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)] + 
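The `cntr_bitwidth` and `incr_bitwidth` computations above size the controller's counter and increment registers. A short worked instance, with hypothetical loop bounds and increment values:

```python
import math

# Counters run down from (iterations - 2), so the width must cover max(iters - 1):
loop_iters = [28, 28, 1, 1, 1]  # H, W, KH, KW, SIMD for a parallel-style SWG
cntr_bitwidth = math.ceil(math.log2(max(i - 2 + 1 for i in loop_iters)))
assert cntr_bitwidth == 5  # 27 fits in 5 bits

# Increments are signed, hence the extra sign bit:
incrs = [1, 1, 1, -235, -103]  # example head/tail increment values
incr_bitwidth = 1 + math.ceil(math.log2(max(abs(i) + 1 for i in incrs)))
assert incr_bitwidth == 9  # |-235| + 1 = 236 -> 8 magnitude bits + 1 sign bit
```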
code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)] + code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)] + + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + # prepare buffer partitioning into "reg_fifos" and "bram_fifos" + # use normalized ([H,W]=[1,W]) dimensions for 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + reg_fifos = [] + bram_fifos_depth = [] + + px_idx = 0 + for ky in range(k_h): + reg_fifo = [] + for kx in range(k_w): + reg_fifo.append(px_idx) + px_idx += 1 + if kx < (k_w - 1): + reg_fifo.extend([-1] * (dilation_w - 1)) + px_idx += dilation_w - 1 + reg_fifos.append(reg_fifo) + + if ky < (k_h - 1): + line_buffer_len = (w - kernel_width) + w * (dilation_h - 1) + bram_fifos_depth.append(line_buffer_len) + px_idx += line_buffer_len + + code_gen_dict["$GENERATE_REG_FIFOS$"] = [] + for i, reg_fifo in enumerate(reg_fifos): + code_gen_dict["$GENERATE_REG_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] reg_fifo_{id}_in; + wire [IN_WIDTH-1:0] reg_fifo_{id}_out; + wire [IN_WIDTH*{len}-1:0] reg_fifo_{id}; + swg_reg_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}) + ) + reg_buffer_inst_{id} + ( + .clk(clk), + .shift_enable(shift_enable), + .shift_in(reg_fifo_{id}_in), + .shift_out(reg_fifo_{id}_out), + .data_out(reg_fifo_{id}) + );""".format( + id=i, + len=len(reg_fifo), + ) + ) + + code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] + for i, bram_fifo_depth in enumerate(bram_fifos_depth): + code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] bram_fifo_{id}_in; + wire [IN_WIDTH-1:0] bram_fifo_{id}_out; + swg_ram_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}), + .RAM_STYLE("{ram_style}") + ) + ram_buffer_inst_{id} + ( + .clk(clk), + .rst_n(rst_n), + .shift_enable(shift_enable), + .shift_in(bram_fifo_{id}_in), + .shift_out(bram_fifo_{id}_out) + );""".format( + id=i, + len=bram_fifo_depth, + ram_style=self.get_nodeattr("ram_style"), + ) + ) + + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] + out_idx = mmv_out - 1 + for fifo_id, reg_fifo in enumerate(reg_fifos): + for fifo_idx, access_idx in enumerate(reg_fifo): + if access_idx != -1: + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( + """assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] + = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+ + OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];""".format( + out_idx=out_idx, + fifo_id=fifo_id, + access_idx=len(reg_fifo) - 1 - int((max(reg_fifo) - access_idx) / M), + mmv_idx=(max(reg_fifo) - access_idx) % M, + mmv=M, + ) + ) + # reversal: out_idx=0 -> oldest buffer element -> highest access_idx + out_idx = out_idx - 1 + assert out_idx == -1, "ERROR: Not all output vector elements connected" + + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = [] + for i in range(len(reg_fifos)): + if i == 0: + # first FIFO containing newest elements -> input comes from input reg + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign reg_fifo_{fifo_id}_in = data_in;""".format( + fifo_id=i, + ) + ) + else: + # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer) + input_fifo_id = i - 1 + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out; + """.format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + for i in range(len(bram_fifos_depth)): + input_fifo_id = i + 
code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out; + """.format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + + return template_path, code_gen_dict + + def select_impl_style(self): + """Selects implementation style based on folding configuration.""" + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + k_h, k_w = k + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + # check for valid configuration + assert ( + kernel_height <= ifm_dim_h + and kernel_width <= ifm_dim_w + and stride_h <= ifm_dim_h + and stride_w <= ifm_dim_w + ), "Illegal conv configuration: kernel or stride > FM dimension" + + # init folding config + if self.get_nodeattr("parallel_window"): + # mmv_in = M * 1 + mmv_out = M * k_h * k_w + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to IFMChannels" + else: + # mmv_in = 1 + mmv_out = 1 + assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide IFMChannels" + + # choose implementation style + if mmv_out > 1 or (k_h == 1 and k_w == 1): + impl_style = "parallel" + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to IFMChannels" + else: + impl_style = "default" + + return impl_style + + def generate_hdl(self): + """Generates HDL code and wrapper for the IP, depending on required + implementation style.""" + impl_style = self.select_impl_style() + + # prepare code generation by filling out dictionaries + if impl_style == "default": + template_path, code_gen_dict = self.prepare_codegen_default() + elif impl_style == "parallel": + template_path, code_gen_dict = self.prepare_codegen_parallel() + if self.get_nodeattr("dynamic_mode"): + raise Exception("Dynamic mode is not compatible with parallel_window") + else: + raise Exception("Requested impl. style not implemented") + + # add general parameters to dictionary + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + ram_style = self.get_nodeattr("ram_style") + code_gen_dict["$RAM_STYLE$"] = ['"{}"'.format(ram_style)] + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_wrapper_dynamic.v" + else: + template_select = "/finn-rtllib/swg/swg_template_wrapper.v" + with open(os.environ["FINN_ROOT"] + template_select, "r") as f: + template_wrapper = f.read() + with open(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_axilite.v", "r") as f: + template_axilite = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + template_wrapper = template_wrapper.replace(key, code_gen_line) + template_axilite = template_axilite.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), + "w", + ) as f: + f.write(template) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper) + + # AXI-Lite reg. file component is only needed for dynamic mode + if self.get_nodeattr("dynamic_mode"): + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_axilite.v"), + "w", + ) as f: + f.write(template_axilite) + + # Copy static source file for common core components + shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_common.sv", code_gen_dir) + shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv", code_gen_dir) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "swg_pkg.sv", + self.get_nodeattr("gen_top_module") + "_wrapper.v", + self.get_nodeattr("gen_top_module") + "_impl.sv", + "swg_common.sv", + ] + if self.get_nodeattr("dynamic_mode"): + verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v") + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "swg_pkg.sv", + self.get_nodeattr("gen_top_module") + "_wrapper.v", + 
self.get_nodeattr("gen_top_module") + "_impl.sv", + "swg_common.sv", + ] + + if self.get_nodeattr("dynamic_mode"): + sourcefiles += [self.get_nodeattr("gen_top_module") + "_axilite.v"] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + + def get_verilog_top_module_intf_names(self): + # Overload default HLSCustomOp implementation to add axilite control IF + """Return a dict of names of input and output interfaces. + The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). + Each block must have at most one aximm and one axilite.""" + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("dynamic_mode"): + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None): + """Returns a configuration dict to re-configure FM dimension during + runtime. Stride and dilation can also be changed. Certain restrictions + apply (e.g. component must be synthesized for largest buffer size).""" + # NOTE: For better driver integration, this functionality could be packaged + # as a standalone function in the future + if self.select_impl_style() != "default": + raise Exception("Impl. style is incompatible with dynamic mode") + + if ifm_dim is None: + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + if stride is None: + stride = self.get_nodeattr("Stride") + if dilation is None: + dilation = self.get_nodeattr("Dilation") + + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + # update attributes and perform sanity check + original_buffer_depth = self.get_buffer_depth() + self.set_nodeattr("IFMDim", ifm_dim) + self.set_nodeattr("OFMDim", ofm_dim) + self.set_nodeattr("Stride", stride) + self.set_nodeattr("Dilation", dilation) + assert ( + self.get_buffer_depth() <= original_buffer_depth + ), """Error: requested + dynamic configuration does not fit in generated buffer implementation.""" + + # (re-)call codegen and extract new values + # each setting is mapped to an axi-lite register address + template_path, code_gen_dict = self.prepare_codegen_default() + config = { + "cfg_wren": (0 * 4, 1), + "cfg_cntr_simd": (1 * 4, int(code_gen_dict["$LOOP_SIMD_ITERATIONS$"][0])), + "cfg_cntr_kw": (2 * 4, int(code_gen_dict["$LOOP_KW_ITERATIONS$"][0])), + "cfg_cntr_kh": (3 * 4, int(code_gen_dict["$LOOP_KH_ITERATIONS$"][0])), + "cfg_cntr_w": (4 * 4, int(code_gen_dict["$LOOP_W_ITERATIONS$"][0])), + "cfg_cntr_h": (5 * 4, int(code_gen_dict["$LOOP_H_ITERATIONS$"][0])), + "cfg_incr_head_simd": (6 * 4, int(code_gen_dict["$HEAD_INCR_SIMD$"][0])), + "cfg_incr_head_kw": (7 * 4, int(code_gen_dict["$HEAD_INCR_KW$"][0])), + "cfg_incr_head_kh": (8 * 4, int(code_gen_dict["$HEAD_INCR_KH$"][0])), + "cfg_incr_head_w": (9 * 4, 
int(code_gen_dict["$HEAD_INCR_W$"][0])), + "cfg_incr_head_h": (10 * 4, int(code_gen_dict["$HEAD_INCR_H$"][0])), + "cfg_incr_tail_w": (11 * 4, int(code_gen_dict["$TAIL_INCR_W$"][0])), + "cfg_incr_tail_h": (12 * 4, int(code_gen_dict["$TAIL_INCR_H$"][0])), + "cfg_incr_tail_last": (13 * 4, int(code_gen_dict["$TAIL_INCR_LAST$"][0])), + "cfg_last_read": (14 * 4, int(code_gen_dict["$LAST_READ_ELEM$"][0])), + "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])), + } + return config + + def code_generation_ipgen(self, model, fpgapart, clk): + """Generates (System-)Verilog code for IP generation (instead of HLS code).""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Not implemented (RTL component).""" + pass + + def code_generation_cppsim(self, model): + """Not implemented (RTL component).""" + pass + + def compile_singlenode_code(self): + """Not implemented (RTL component).""" + pass + + def global_includes(self): + """Not implemented (RTL component).""" + pass + + def defines(self, var): + """Not implemented (RTL component).""" + pass + + def read_npy_data(self): + """Not implemented (RTL component).""" + pass + + def strm_decl(self): + """Not implemented (RTL component).""" + pass + + def docompute(self): + """Not implemented (RTL component).""" + pass + + def dataoutstrm(self): + """Not implemented (RTL component).""" + pass + + def save_as_npy(self): + """Not implemented (RTL component).""" + pass + + def blackboxfunction(self): + """Not implemented (RTL component).""" + pass + + def pragmas(self): + """Not implemented (RTL component).""" + pass diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index da29a524b6..e2cea6da6b 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -36,11 +36,11 @@ class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function. + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. 
Basically performs a down sampling of the image removing rows and columns.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -55,6 +55,10 @@ def get_nodeattr_types(self): "inputDataType": ("s", True, ""), # Batch size "numInputVectors": ("i", False, 1), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), + # for 1D only: (D, 1) (True) or (1, D) dims + "is1D_unitx": ("i", False, 1), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -66,28 +70,46 @@ def get_downsampled_odim(self): return int(np.floor((idim - 1) / stride) + 1) def get_exp_cycles(self): + is_1D = self.get_nodeattr("is1D") idim = self.get_nodeattr("ImgDim") + idim_total = idim if is_1D else idim * idim channels = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = channels / simd * batch_size * idim * idim + exp_cycles = channels / simd * batch_size * idim_total return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") idim = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - ishape = (batch, idim, idim, num_ch) + if is_1D: + if is_1D_unitx: + ishape = (batch, idim, 1, num_ch) + else: + ishape = (batch, 1, idim, num_ch) + else: + ishape = (batch, idim, idim, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): + is_1D = self.get_nodeattr("is1D") + is_1D_unitx = self.get_nodeattr("is1D_unitx") odim = self.get_downsampled_odim() num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - oshape = (batch, odim, odim, num_ch) + if is_1D: + if is_1D_unitx: + oshape = (batch, odim, 1, num_ch) + else: + oshape = (batch, 1, odim, num_ch) + else: + oshape = (batch, odim, odim, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -96,7 +118,7 @@ def get_folded_input_shape(self): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -129,21 +151,21 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)"""
         return self.get_input_datatype()

-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd

-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -190,23 +212,36 @@ def read_npy_data(self):
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"] = []
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
         )

     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
         )

     def docompute(self):
+        dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
+        sname = self.hls_sname()
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """ConvolutionInputGenerator_kernel1 (in0, out, numReps);"""
+            f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0_{sname}, out_{sname}, numReps);"""
         ]

     def dataoutstrm(self):
@@ -225,12 +260,13 @@ def dataoutstrm(self):
         oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")

         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
             % (
                 packed_hls_type,
                 elem_hls_type,
                 elem_bits,
                 npy_type,
+                self.hls_sname(),
                 oshape_cpp_str,
                 npy_out,
             )
@@ -243,20 +279,24 @@ def blackboxfunction(self):
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
-            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            % (
+                self.onnx_node.name,
+                packed_hls_type,
+                self.hls_sname(),
+                packed_hls_type,
+                self.hls_sname(),
+            )
         ]

     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
         ]
         self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
         )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")

     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 04ca45e7f1..1f2d1b79be 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -38,8 +38,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
     """Class that corresponds to finn-hlslib function of the same name."""

-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)

     def get_nodeattr_types(self):
         my_attrs = {
@@ -61,13 +61,13 @@ def get_nodeattr_types(self):
     def get_num_output_streams(self):
         return self.get_nodeattr("NumOutputStreams")

-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [ch])
         return ishape

-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -132,28 +132,26 @@ def verify_node(self):
             self.get_nodeattr("inputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
-            info_messages.append(
-                """The required GlobalAccPool_Batch attributes do not exist."""
-            )
+            info_messages.append("""The required DuplicateStreams_Batch attributes do not exist.""")

         return info_messages

-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]

-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("inputDataType")]

-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width

-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -161,9 +159,7 @@ def get_outstream_width(self):
         return out_width

     def get_number_output_values(self):
-        return self.get_num_output_streams() * np.prod(
-            self.get_folded_output_shape()[1:-1]
-        )
+        return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1])

     def get_exp_cycles(self):
         # Channels/PE * batch size * fmdim * fmdim
@@ -235,9 +231,7 @@ def execute_node(self, context, graph):
             # execute the precompiled model
             super().exec_precompiled_singlenode_model()
             # load output npy file
-            super().npy_to_dynamic_outputs(
-                context, ["output%d.npy" % i for i in range(n_outputs)]
-            )
+            super().npy_to_dynamic_outputs(context, ["output%d.npy" % i for i in range(n_outputs)])
             for i in range(n_outputs):
                 assert (
                     context[node.output[i]].shape == exp_oshape
@@ -309,18 +303,27 @@ def read_npy_data(self):
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"] = []
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
         )

     def strm_decl(self):
         n_outputs = self.get_num_output_streams()
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
         )
         for i in range(n_outputs):
-            out_name = "out%d" % i
+            out_name = "out%d_%s" % (i, self.hls_sname())
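+            # out_name carries the HLS stream interface suffix from hls_sname()
+            # (typically "V"), giving per-output names like "out0_V", "out1_V"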
self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> %s ("%s");' % (self.get_outstream_width(), out_name, out_name) @@ -328,8 +331,13 @@ def strm_decl(self): def docompute(self): n_outputs = self.get_num_output_streams() - ostreams = ["out%d" % x for x in range(n_outputs)] - dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams)) + ostreams = [] + for i in range(n_outputs): + ostreams.append("out%d_%s" % (i, self.hls_sname())) + dc = "DuplicateStreamsCustom(in0_%s, %s);" % ( + self.hls_sname(), + ",".join(ostreams), + ) self.code_gen_dict["$DOCOMPUTE$"] = [dc] def dataoutstrm(self): @@ -346,7 +354,7 @@ def dataoutstrm(self): outstrm_code = [] for i in range(n_outputs): - out_name = "out%d" % i + out_name = "out%d_%s" % (i, self.hls_sname()) npy_out = "%s/output%d.npy" % (code_gen_dir, i) outstrm_code.append( 'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");' @@ -371,10 +379,14 @@ def blackboxfunction(self): inp_streams = [] o_stream_w = self.get_outstream_width() i_stream_w = self.get_instream_width() - in_stream = "hls::stream > &in0" % (i_stream_w) + in_stream = "hls::stream > &in0_%s" % (i_stream_w, self.hls_sname()) inp_streams.append(in_stream) for i in range(n_outputs): - out_stream = "hls::stream > &out%d" % (o_stream_w, i) + out_stream = "hls::stream > &out%d_%s" % ( + o_stream_w, + i, + self.hls_sname(), + ) inp_streams.append(out_stream) self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ @@ -387,16 +399,13 @@ def blackboxfunction(self): def pragmas(self): n_outputs = self.get_num_output_streams() self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] for i in range(n_outputs): self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out%d name=out%d_%s" - % (i, i, self.hls_sname()) + "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname()) ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() @@ -408,3 +417,13 @@ def get_verilog_top_module_intf_names(self): ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) ) return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py new file mode 100644 index 0000000000..ab1dc00118 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -0,0 +1,484 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingEltwise(HLSCustomOp): + """Class that corresponds to finn-hlslib StreamingEltwise function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? 
a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType0") + self.get_nodeattr("inputDataType1") + self.get_nodeattr("eltwiseOp") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required StreamingEltwise attributes do not exist.""") + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(ind))] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = 
max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" + export_idt0 = self.get_input_datatype(0) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + # exact same thing for input1 + inp = context[node.input[1]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" + export_idt1 = self.get_input_datatype(1) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits0 = self.get_instream_width(0) + nbits1 = self.get_instream_width(1) + rtlsim_inp0 = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0 + ) + rtlsim_inp1 = npy_to_rtlsim_input( + "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1 + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, 
+                out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "eltwise.hpp"',
+            '#include "interpret.hpp"',
+        ]
+
+        self.code_gen_dict["$GLOBALS$"].extend(
+            [
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct absdiff {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a>b? a-b : b-a;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct sub {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a-b;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct add {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return a+b;",
+                "}",
+                "};",
+            ]
+        )
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        elem_bits_0 = idt0.bitwidth()
+        elem_bits_1 = idt1.bitwidth()
+        packed_bits_0 = self.get_instream_width(0)
+        packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0
+        packed_bits_1 = self.get_instream_width(1)
+        packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type_0,
+                elem_hls_type_0,
+                elem_bits_0,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+        npy_in = "%s/input_1.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);'
+            % (
+                packed_hls_type_1,
+                elem_hls_type_1,
+                elem_bits_1,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(0), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in1_{} ("in1_{}");'.format(
+                self.get_instream_width(1), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    def docompute(self):
+        op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        out_hls_type = odt.get_hls_datatype_str()
+        slice_in0 = "Slice<%s>" % elem_hls_type_0
+        slice_in1 = "Slice<%s>" % elem_hls_type_1
+        slice_out = "Slice<%s>" % out_hls_type
+        eltwise_op_str = self.get_eltwise_op_lambda()
+        "%sEltwiseFunction<%s, %s, %s>()" % (
+            op,
+            elem_hls_type_0,
+            elem_hls_type_1,
+            out_hls_type,
+        )
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, {},
+            {}>(in0_{}, in1_{}, out_{}, {});""".format(
+                "StreamingEltwise",
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                int(np.prod(self.get_folded_output_shape()[:-2])),
+                slice_in0,
+                slice_in1,
+                slice_out,
+                self.hls_sname(),
+                self.hls_sname(),
+                self.hls_sname(),
+                eltwise_op_str,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0_{}, hls::stream<ap_uint<{}>> &in1_{},
+                hls::stream<ap_uint<{}>> &out_{})""".format(
+                self.onnx_node.name,
+                self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(),
+                self.hls_sname(),
+                self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(),
+                self.hls_sname(),
+                self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+                self.hls_sname(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        sname = self.hls_sname()
+        swidth = self.get_instream_width_padded()
+        intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index d69ea471ea..5bd5e07916 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -39,18 +39,14 @@ class FMPadding_Batch(HLSCustomOp):
     """Corresponds to finn-hlslib FMPadding_Batch function.
     Pads input image by given amount."""

-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)

     def get_nodeattr_types(self):
         my_attrs = {
             # spatial size of input images
             "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             # total padding (per dimension) to apply
-            # NOTE: Current padding scheme that is applied tries to pad the same
-            # amount of zeros in front and behind the image for each dimension.
- # As an example, a padding scheme such as [1, x, 3, x] is equal - # to [2, x, 2, x] "Padding": ( "ints", True, @@ -62,10 +58,6 @@ def get_nodeattr_types(self): "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), - # controls distribution of padded pixels - # in case of uneven padding -- see FMPadding fxn - # in hlslib - "PaddingStyle": ("i", False, 2, {2, 1}), # shape describing input vecs per execution "numInputVectors": ("i", False, 1), } @@ -90,20 +82,20 @@ def get_exp_cycles(self): exp_cycles = (channels / simd) * batch_size * odim_h * odim_w return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): idim_h, idim_w = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") ishape = (1, idim_h, idim_w, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): odim_h, odim_w = self.get_padded_odim() num_ch = self.get_nodeattr("NumChannels") oshape = (1, odim_h, odim_w, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -112,7 +104,7 @@ def get_folded_input_shape(self): folded_ishape = normal_ishape[:-1] + [fold, simd] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") @@ -144,7 +136,7 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] # the hlslib op always pads with zeros, so ensure that the DataType @@ -152,16 +144,16 @@ def get_input_datatype(self): assert ret.allowed(0), "FMPadding_Batch DataType must support zero" return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. 
(Same as input datatype)"""
         return self.get_input_datatype()

-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd

-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -179,23 +171,21 @@ def defines(self, var):
         pad = self.get_nodeattr("Padding")
         pad_h = pad[0] + pad[2]
         pad_w = pad[1] + pad[3]
-        is_square = idim_h == idim_w
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w

-        if is_square:
-            assert (
-                pad_h == pad_w
-            ), "Only equal padding along the dimensions for square images is supported"
+        if is_square_img and is_square_pad:
             self.code_gen_dict["$DEFINES$"] = [
                 """#define ImgDim1 {}\n#define OutputDim1 {}\n
-                #define Padding1 {}\n#define NumChannels1 {}\n
-                #define SIMD1 {}\n#define PaddingStyle1 {}\n
+                #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n
+                #define NumChannels1 {}\n#define SIMD1 {}\n
                 #define numReps {}\n""".format(
                     idim_h,
                     odim_h,
-                    pad_h,
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -204,20 +194,22 @@ def defines(self, var):
                 """
                 #define OutputDim1_x {}\n
                 #define OutputDim1_y {}\n
-                #define Padding1_x {}\n
-                #define Padding1_y {}\n
+                #define PaddingLeft1 {}\n
+                #define PaddingRight1 {}\n
+                #define PaddingTop1 {}\n
+                #define PaddingBottom1 {}\n
                 #define NumChannels1 {}\n
                 #define SIMD1 {}\n
-                #define PaddingStyle1 {}\n
                 #define numReps {}\n
                 """.format(
                     odim_w,
                     odim_h,
-                    pad_w,
-                    pad_h,
+                    pad[1],
+                    pad[3],
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -236,17 +228,28 @@ def read_npy_data(self):
         npy_in = "%s/input_0.npy" % code_gen_dir
         self.code_gen_dict["$READNPYDATA$"] = []
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
         )

     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
         )

     def docompute(self):
@@ -254,22 +257,27 @@ def docompute(self):
         node = self.onnx_node

         idim_h, idim_w = self.get_nodeattr("ImgDim")
-        is_square = idim_h == idim_w
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w

-        if is_square:
+        if is_square_img and is_square_pad:
             hls_call = node.op_type
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{} (in0, out, numReps);""".format(
-                    hls_call, in_t
+                """{} (in0_{}, out_{}, numReps);""".format(
+                    hls_call, in_t, self.hls_sname(), self.hls_sname()
                 )
             ]
         else:
             hls_call = "FMPadding_nonsquare_Batch"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{} (in0, out,
numReps);""".format( - hls_call, in_t + """{} (in0_{}, out_{}, numReps);""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() ) ] @@ -289,12 +297,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -307,20 +316,24 @@ def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py new file mode 100644 index 0000000000..d79c214730 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py @@ -0,0 +1,414 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os +import shutil +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class FMPadding_rtl(HLSCustomOp): + """CustomOp wrapper for the finn-rtllib fmpadding_axi component + Supports adjusting the padding amount and spatial feature sizes at + runtime.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # spatial size of input images + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] + # total padding (per dimension) to apply + "Padding": ( + "ints", + True, + [1, 1, 1, 1], + ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] + # number of channels in input image + "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), + # FINN input datatype + "inputDataType": ("s", True, ""), + # shape describing input vecs per execution + "numInputVectors": ("i", False, 1), + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime + "dynamic_mode": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_padded_odim(self): + "Return the padded spatial size of the output." + idim_h, idim_w = self.get_nodeattr("ImgDim") + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + odim_h = idim_h + pad_h + odim_w = idim_w + pad_w + return [odim_h, odim_w] + + def get_exp_cycles(self): + odim_h, odim_w = self.get_padded_odim() + channels = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + batch_size = self.get_nodeattr("numInputVectors") + exp_cycles = (channels / simd) * batch_size * odim_h * odim_w + return int(exp_cycles) + + def get_normal_input_shape(self, ind=0): + idim_h, idim_w = self.get_nodeattr("ImgDim") + num_ch = self.get_nodeattr("NumChannels") + ishape = (1, idim_h, idim_w, num_ch) + return ishape + + def get_normal_output_shape(self, ind=0): + odim_h, odim_w = self.get_padded_odim() + num_ch = self.get_nodeattr("NumChannels") + + oshape = (1, odim_h, odim_w, num_ch) + return oshape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert 
ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert ret.allowed(0), "FMPadding_rtl DataType must support zero" + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return ibits * simd + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return obits * simd + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_verilog_top_module_intf_names(self): + # Overload default HLSCustomOp implementation to add axilite control IF + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("dynamic_mode"): + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception("cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" + + def get_template_values(self, ifm_dims, pads, chans, simd, idt): + dimY, dimX = ifm_dims + padT, padL, padB, padR = pads + y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1))) + x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1))) + topname = self.get_verilog_top_module_name() + stream_bits = idt.bitwidth() * simd + stream_bits = int(roundup_to_integer_multiple(stream_bits, 8)) + code_gen_dict = { + "XCOUNTER_BITS": int(x_counter_bits), + "YCOUNTER_BITS": int(y_counter_bits), + "NUM_CHANNELS": int(chans), + "SIMD": int(simd), + "ELEM_BITS": idt.bitwidth(), + "TOP_MODULE_NAME": topname, + "INIT_XON": int(padL), + "INIT_XOFF": int(padL + dimX), + "INIT_XEND": int(padL + dimX + padR - 1), + "INIT_YON": int(padT), + "INIT_YOFF": int(padT + dimY), + "INIT_YEND": int(padT + dimY + padB - 1), + "STREAM_BITS": int(stream_bits), + } + return code_gen_dict + + def get_dynamic_config(self, ifm_dims=None, pads=None): + """Returns a configuration dict to re-configure FM dimension and + padding amounts during runtime.""" + + if ifm_dims is None: + ifm_dims = self.get_nodeattr("ImgDim") + if pads is None: + pads = self.get_nodeattr("Padding") + chans = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + idt = self.get_input_datatype() + code_gen_dict = self.get_template_values(ifm_dims, pads, chans, simd, idt) + config = { + "XON": (0 * 4, (code_gen_dict["INIT_XON"])), + "XOFF": (1 * 4, (code_gen_dict["INIT_XOFF"])), + "XEND": (2 * 4, (code_gen_dict["INIT_XEND"])), + "YON": (3 * 4, (code_gen_dict["INIT_YON"])), + "YOFF": (4 * 4, (code_gen_dict["INIT_YOFF"])), + "YEND": (5 * 4, (code_gen_dict["INIT_YEND"])), + } + return config + + def generate_hdl(self): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" + template_path = rtlsrc + "/fmpadding_template.v" + dims = self.get_nodeattr("ImgDim") + pads = self.get_nodeattr("Padding") + chans = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + idt = self.get_input_datatype() + code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt) + # save top module name so we can refer to it after this node has been renamed + # 
(e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "$%s$" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"] + for sv_file in sv_files: + shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "fmpadding_axi.sv", + "fmpadding.sv", + "axi2we.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "fmpadding_axi.sv", + "fmpadding.sv", + "axi2we.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. 
+ Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index adafa7dcf3..5ed440dace 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -38,8 +38,8 @@ class GlobalAccPool_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib AccPool_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -56,13 +56,13 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ch]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -71,7 +71,7 @@ def get_folded_input_shape(self): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) if len(vecs) == 1: @@ -80,7 +80,7 @@ def get_normal_output_shape(self): oshape = tuple([vecs[0]] + [1, 1, ch]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") unfolded_shape = list(self.get_normal_output_shape()) @@ -128,9 +128,7 @@ def verify_node(self): self.get_nodeattr("inputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required GlobalAccPool_Batch attributes do not exist.""" - ) + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") # verify that input data is 2D if len(self.get_nodeattr("numInputVectors")) != 3: @@ -139,11 +137,11 @@ def verify_node(self): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" # determine data type from image size and input type idt = DataType[self.get_nodeattr("inputDataType")] @@ -155,14 +153,14 @@ def get_output_datatype(self): extreme_value = npixels * idt.max() return DataType.get_smallest_possible(extreme_value) - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = 
self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" obits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") @@ -267,27 +265,40 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ - """AccPool_Batch<{}, {}, {}, {}, {}> (in0, out, 1);""".format( + """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format( self.get_normal_input_shape()[1], self.get_nodeattr("NumChannels"), self.get_input_datatype().get_hls_datatype_str(), self.get_nodeattr("PE"), self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), ) ] @@ -304,12 +315,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -320,21 +332,21 @@ def save_as_npy(self): def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 9978ab0c71..4fed8ed4b5 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -29,8 +29,9 @@ import numpy as np import os import subprocess +import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import 
roundup_to_integer_multiple @@ -42,6 +43,7 @@ pyverilate_get_liveness_threshold_cycles, ) from finn.util.hls import CallHLS +from finn.util.pyverilator import make_single_source_file from . import templates @@ -57,8 +59,8 @@ class HLSCustomOp(CustomOp): custom node should have. Some as abstract methods, these have to be filled when writing a new fpgadataflow custom op node.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.code_gen_dict = {} @@ -107,10 +109,18 @@ def get_nodeattr_types(self): # ID of FPGA device to which this Op is allocated, in # a multi-FPGA setting "device_id": ("i", False, 0), - # input and output FIFO depths - "inFIFODepth": ("i", False, 2), - "outFIFODepth": ("i", False, 2), + # input and output FIFO depths for multi-I/O nodes + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, [2]), "output_hook": ("s", False, ""), + # accumulated characteristic function over two periods + "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), + "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), + # the period for which the characterization was run + "io_chrc_period": ("i", False, 0), + # amount of zero padding inserted during chrc. + "io_chrc_pads_in": ("ints", False, []), + "io_chrc_pads_out": ("ints", False, []), } def get_verilog_top_module_name(self): @@ -138,6 +148,7 @@ def get_verilog_top_module_intf_names(self): intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] intf_names["aximm"] = [] intf_names["axilite"] = [] + intf_names["ap_none"] = [] return intf_names def get_verilog_top_filename(self): @@ -158,13 +169,11 @@ def get_all_verilog_paths(self): code_gen_dir != "" ), """Node attribute "code_gen_dir_ipgen" is not set. Please run HLSSynthIP first.""" - verilog_path = "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, self.onnx_node.name - ) + verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) # default impl only returns the HLS verilog codegen dir return [verilog_path] - def get_all_verilog_filenames(self): + def get_all_verilog_filenames(self, abspath=False): "Return list of all Verilog files used for this node." 
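Editor's note: the rtlsim changes in this hunk collect absolute Verilog paths and merge them into one source file before handing it to Verilator. The real helper is imported from finn.util.pyverilator; the sketch below only illustrates the assumed concatenation idea (name and behavior are hypothetical, not the actual implementation):

import os

def make_single_source_file_sketch(verilog_files, target_file):
    """Concatenate several Verilog sources into one file (illustrative)."""
    with open(target_file, "w") as f_out:
        for src in verilog_files:
            assert os.path.isfile(src), "expected absolute path to a .v file"
            with open(src, "r") as f_in:
                f_out.write(f_in.read())
            f_out.write("\n")  # keep module boundaries separated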
verilog_files = [] @@ -172,7 +181,10 @@ def get_all_verilog_filenames(self): for verilog_path in verilog_paths: for f in os.listdir(verilog_path): if f.endswith(".v"): - verilog_files += [f] + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] return verilog_files def prepare_rtlsim(self): @@ -182,13 +194,18 @@ def prepare_rtlsim(self): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - verilog_paths = self.get_all_verilog_paths() - verilog_files = self.get_all_verilog_filenames() + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + # build the Verilator emu library sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], trace_depth=get_rtlsim_trace_depth(), top_module_name=self.get_verilog_top_module_name(), ) @@ -336,9 +353,10 @@ def ipgen_singlenode_code(self): assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) self.set_nodeattr("ipgen_path", ipgen_path) ip_path = ipgen_path + "/sol1/impl/ip" - assert os.path.isdir( - ip_path - ), "IPGen failed: %s not found. Check log under %s" % (ip_path, code_gen_dir) + assert os.path.isdir(ip_path), "IPGen failed: %s not found. Check log under %s" % ( + ip_path, + code_gen_dir, + ) self.set_nodeattr("ip_path", ip_path) vlnv = "xilinx.com:hls:%s:1.0" % node.name self.set_nodeattr("ip_vlnv", vlnv) @@ -397,18 +415,20 @@ def compile_singlenode_code(self): builder.build(code_gen_dir) self.set_nodeattr("executable_path", builder.executable_path) - def dynamic_input_to_npy(self, context, count): + def dynamic_input_to_npy(self, context, count, target_dir=""): """Saves input (given context) into .npy files. Count indicates the number of inputs that have to be saved.""" node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - if code_gen_dir == "": - raise Exception( + if target_dir == "": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + if code_gen_dir == "": + raise Exception( + """ + Found no codegen dir for this node, did you run the prepare_cppsim transformation? """ -Found no codegen dir for this node, did you run the prepare_cppsim transformation? 
- """ - ) + ) + target_dir = code_gen_dir # create a npy file for each input of the node (in_ind is input index) # assuming dynamic inputs start from 0 for in_ind in range(count): @@ -427,7 +447,7 @@ def dynamic_input_to_npy(self, context, count): # make copy before saving the array reshaped_input = reshaped_input.copy() np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + os.path.join(target_dir, "input_{}.npy".format(in_ind)), reshaped_input, ) @@ -685,40 +705,48 @@ def pragmas(self): HLSCustomOp class but has to be filled by every node.""" pass - def get_normal_input_shape(self): + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input stream ind.""" + raise Exception("get_input_datatype not implemented for this op") + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output stream ind.""" + raise Exception("get_output_datatype not implemented for this op") + + def get_normal_input_shape(self, ind=0): """Returns normal input shape if implemented.""" raise Exception("get_normal_input_shape not implemented for this op") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): """Returns folded output shape if implemented.""" raise Exception("get_normal_output_shape not implemented for this op") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): """Returns folded input shape (according to synapse folding), if implemented.""" raise Exception("get_folded_input_shape not implemented for this op") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): """Returns folded output shape (according to neuron folding), if implemented.""" raise Exception("get_folded_output_shape not implemented for this op") - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width, if implemented.""" raise Exception("get_instream_width not implemented for this op") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width, if implemented.""" raise Exception("get_outstream_width not implemented for this op") - def get_instream_width_padded(self): + def get_instream_width_padded(self, ind=0): """Returns input stream width padded to a multiple of 8. This is required by the AXI Stream spec.""" - in_width = self.get_instream_width() + in_width = self.get_instream_width(ind=ind) return roundup_to_integer_multiple(in_width, 8) - def get_outstream_width_padded(self): + def get_outstream_width_padded(self, ind=0): """Returns output stream width padded to a multiple of 8. 
This is required by the AXI Stream spec.""" - out_width = self.get_outstream_width() + out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) def get_ap_int_max_w(self): @@ -727,7 +755,114 @@ def get_ap_int_max_w(self): instream = self.get_instream_width() outstream = self.get_outstream_width() ret = max([instream, outstream]) - assert ret <= 32768, ( - "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret - ) + assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret return ret + + def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + """Return the unconstrained characteristic functions for this node.""" + # ensure rtlsim is ready + assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name + if self.get_nodeattr("io_chrc_period") > 0: + warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name) + return + exp_cycles = self.get_exp_cycles() + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + n_outs = np.prod(self.get_folded_output_shape()[:-1]) + if exp_cycles == 0: + # try to come up with an optimistic estimate + exp_cycles = min(n_inps, n_outs) + assert ( + exp_cycles <= period + ), "Period %d too short to characterize %s : expects min %d cycles" % ( + period, + self.onnx_node.name, + exp_cycles, + ) + sim = self.get_rtlsim() + # signal name + sname = "_" + self.hls_sname() + "_" + if override_rtlsim_dict is not None: + io_dict = override_rtlsim_dict + else: + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + # extra dicts to keep track of cycle-by-cycle transaction behavior + # note that we restrict key names to filter out weight streams etc + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + + def monitor_txns(sim_obj): + for inp in txns_in: + in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 + in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + if in_ready and in_valid: + txns_in[inp].append(1) + else: + txns_in[inp].append(0) + for outp in txns_out: + if ( + _read_signal(sim, outp + sname + "TREADY") == 1 + and _read_signal(sim, outp + sname + "TVALID") == 1 + ): + txns_out[outp].append(1) + else: + txns_out[outp].append(0) + + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + n_outs, + sname=sname, + liveness_threshold=period, + hook_preclk=monitor_txns, + ) + assert ( + total_cycle_count <= period + ), """Total cycle count from rtl simulation is higher than + specified period, please set the period higher than {}""".format( + total_cycle_count + ) + self.set_nodeattr("io_chrc_period", period) + + def accumulate_char_fxn(chrc): + p = len(chrc) + ret = [] + for t in range(2 * p): + if t == 0: + ret.append(chrc[0]) + else: + ret.append(ret[-1] + chrc[t % p]) + return np.asarray(ret, dtype=np.int32) + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_pad_in = [] + all_pad_out = [] + for in_idx, in_strm_nm in enumerate(txns_in.keys()): + txn_in = txns_in[in_strm_nm] + if len(txn_in) < period: + pad_in = period - len(txn_in) + txn_in += [0 for x in range(pad_in)] + txn_in = accumulate_char_fxn(txn_in) + all_txns_in[in_idx, :] = txn_in + all_pad_in.append(pad_in) + + for out_idx, out_strm_nm in 
enumerate(txns_out.keys()): + txn_out = txns_out[out_strm_nm] + if len(txn_out) < period: + pad_out = period - len(txn_out) + txn_out += [0 for x in range(pad_out)] + txn_out = accumulate_char_fxn(txn_out) + all_txns_out[out_idx, :] = txn_out + all_pad_out.append(pad_out) + + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_in", all_pad_in) + self.set_nodeattr("io_chrc_pads_out", all_pad_out) diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 33ee1d359c..bb3de268a0 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -47,7 +47,7 @@ # Interfaces # - AXI-MM name specified by intfName unless this is set to "" (empty, the default) -# in which case output AXI-MM are named "out" and input AXI-MM are named "in0" +# in which case output AXI-MM are named "out_V" and input AXI-MM are named "in0_V" # - AXI-MM interface width (in bits) is specified by intfWidth # - AXI-Stream interface width (in bits) is specified by streamWidth # - If inftWidth and streamWidth are not equal, the DMA core performs @@ -75,8 +75,8 @@ class IODMA(HLSCustomOp): """Class that corresponds to finn-hlslib DMA function(s).""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -100,25 +100,23 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): vecs = list(self.get_nodeattr("numInputVectors")) num_ch = self.get_nodeattr("NumChannels") ishape = tuple(vecs + [num_ch]) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): if self.get_nodeattr("direction") == "in": raise ValueError("Folded input shape not defined for input IODMA") else: shape = list(self.get_normal_input_shape()) itype_bits = self.get_input_datatype().bitwidth() intfw = self.get_nodeattr("streamWidth") - assert ( - intfw % itype_bits == 0 - ), "Input stream width must be a multiple of datatype bits" + assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits" elems_per_word = intfw // itype_bits assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" fold_depth = shape[-1] // elems_per_word @@ -126,16 +124,14 @@ def get_folded_input_shape(self): shape.append(elems_per_word) return tuple(shape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): if self.get_nodeattr("direction") == "out": raise ValueError("Folded output shape not defined for output IODMA") else: shape = list(self.get_normal_output_shape()) itype_bits = self.get_output_datatype().bitwidth() intfw = self.get_nodeattr("streamWidth") - assert ( - intfw % itype_bits == 0 - ), "Input stream width must be a multiple of datatype bits" + assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits" elems_per_word = intfw // itype_bits assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" fold_depth = shape[-1] // elems_per_word @@ -166,15 +162,15 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN 
DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): if self.get_nodeattr("direction") == "in": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "out": @@ -182,7 +178,7 @@ def get_instream_width(self): else: raise ValueError("Invalid IODMA direction, please set to in or out") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): if self.get_nodeattr("direction") == "out": return self.get_nodeattr("intfWidth") elif self.get_nodeattr("direction") == "in": @@ -196,9 +192,7 @@ def get_number_output_values(self): stream_width = self.get_nodeattr("streamWidth") nelems = np.prod(oshape) nbits = nelems * itype_bits - assert ( - nbits % stream_width == 0 - ), "DMA: total transfer size must be word multiple" + assert nbits % stream_width == 0, "DMA: total transfer size must be word multiple" ovalues = nbits // stream_width return ovalues @@ -254,15 +248,23 @@ def docompute(self): # DWCs depend on AXI MM and out interface width if strmw == intfw: # case 0: AXI MM width = out width, no DWCs needed - self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + self.code_gen_dict["$DOCOMPUTE$"] = [ + dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname()) + ] elif (strmw % intfw == 0) or (intfw % strmw == 0): # case 1: AXI MM width divisible by out width or vice versa # single DWC + single extra stream needed self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dma2dwc;" % intfw, - dma_inst_template % ("in0", "dma2dwc"), + dma_inst_template % ("in0_" + self.hls_sname(), "dma2dwc"), dwc_inst_template - % (intfw, strmw, total_bits // intfw, "dma2dwc", "out"), + % ( + intfw, + strmw, + total_bits // intfw, + "dma2dwc", + "out_" + self.hls_sname(), + ), ] else: # case 2: AXI MM width not divisible by out width or vice versa @@ -271,26 +273,40 @@ def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dma2lcm;" % intfw, "hls::stream > lcm2out;" % width_lcm, - dma_inst_template % ("in0", "dma2lcm"), + dma_inst_template % ("in0_" + self.hls_sname(), "dma2lcm"), dwc_inst_template % (intfw, width_lcm, total_bits // intfw, "dma2lcm", "lcm2out"), dwc_inst_template - % (width_lcm, strmw, total_bits // width_lcm, "lcm2out", "out"), + % ( + width_lcm, + strmw, + total_bits // width_lcm, + "lcm2out", + "out_" + self.hls_sname(), + ), ] elif direction == "out": # in0 -> (DWCs) -> IODMA -> AXI MM # DWCs depend on AXI MM and out interface width if strmw == intfw: # case 0: in width = AXI MM width, no DWCs needed - self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + self.code_gen_dict["$DOCOMPUTE$"] = [ + dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname()) + ] elif (strmw % intfw == 0) or (intfw % strmw == 0): # case 1: AXI MM width divisible by in width or vice versa # single DWC + single extra stream needed self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dwc2dma;" % intfw, dwc_inst_template - % (strmw, intfw, total_bits // strmw, "in0", "dwc2dma"), - dma_inst_template % ("dwc2dma", "out"), + % ( + strmw, + intfw, + total_bits // strmw, + "in0_" + self.hls_sname(), + "dwc2dma", + ), + dma_inst_template % ("dwc2dma", "out_" + self.hls_sname()), ] else: # case 2: AXI MM width not divisible by out width or vice versa @@ 
-300,10 +316,16 @@ def docompute(self): "hls::stream > in2lcm;" % width_lcm, "hls::stream > lcm2dma;" % intfw, dwc_inst_template - % (strmw, width_lcm, total_bits // strmw, "in0", "in2lcm"), + % ( + strmw, + width_lcm, + total_bits // strmw, + "in0_" + self.hls_sname(), + "in2lcm", + ), dwc_inst_template % (width_lcm, intfw, total_bits // width_lcm, "in2lcm", "lcm2dma"), - dma_inst_template % ("lcm2dma", "out"), + dma_inst_template % ("lcm2dma", "out_" + self.hls_sname()), ] else: raise Exception("Unknown IODMA direction: %s" % direction) @@ -316,13 +338,25 @@ def blackboxfunction(self): direction = self.get_nodeattr("direction") if direction == "in": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)" - % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + "void %s(%s *in0_%s, hls::stream<%s > &out_%s, unsigned int numReps)" + % ( + self.onnx_node.name, + packed_hls_type_in, + self.hls_sname(), + packed_hls_type_out, + self.hls_sname(), + ) ] elif direction == "out": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)" - % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + "void %s(hls::stream<%s > &in0_%s, %s *out_%s, unsigned int numReps)" + % ( + self.onnx_node.name, + packed_hls_type_in, + self.hls_sname(), + packed_hls_type_out, + self.hls_sname(), + ) ] else: raise ValueError("Invalid IODMA direction, please set to in or out") @@ -339,32 +373,32 @@ def pragmas(self): if direction == "in": if intfname == "": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE m_axi offset=slave port=in0" + "#pragma HLS INTERFACE m_axi offset=slave port=in0_" + self.hls_sname() ) else: self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=in0 bundle=control" + "#pragma HLS INTERFACE s_axilite port=in0_%s bundle=control" % (self.hls_sname()) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) elif direction == "out": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ) if intfname == "": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE m_axi offset=slave port=out" + "#pragma HLS INTERFACE m_axi offset=slave port=out_" + self.hls_sname() ) else: self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=out bundle=control" + "#pragma HLS INTERFACE s_axilite port=out_%s bundle=control" % (self.hls_sname()) ) else: raise ValueError("Invalid IODMA direction, please set to in or out") diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py index 3e27ee0111..60d3eb9154 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py @@ -39,8 +39,8 @@ class LabelSelect_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) odt_name = 
self.get_nodeattr("outputDataType") if odt_name == "": # If not provided compute min size @@ -70,13 +70,13 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [nlabels]) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): nlabels = self.get_nodeattr("Labels") pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) @@ -85,13 +85,13 @@ def get_folded_input_shape(self): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k]) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): k = self.get_nodeattr("K") vecs = list(self.get_nodeattr("numInputVectors")) oshape = tuple(vecs + [k, 1]) @@ -141,9 +141,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required LabelSelect_Batch attributes do not exist.""" - ) + info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") # verify that input data is 1D if len(self.get_nodeattr("numInputVectors")) > 1: @@ -152,24 +150,24 @@ def verify_node(self): return info_messages - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" ret = DataType[self.get_nodeattr("outputDataType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): """Returns input stream width.""" ibits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = pe * ibits return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """Returns output stream width.""" return self.get_output_datatype().bitwidth() @@ -275,29 +273,42 @@ def read_npy_data(self): # Also notice that StreamingDataWidthConverter_Batch performs LE packing self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {} > (in0, out, 1);""".format( + """{}<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format( node.op_type, self.get_nodeattr("Labels"), self.get_nodeattr("PE"), 
self.get_nodeattr("K"), self.get_input_datatype().get_hls_datatype_str(), self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), ) ] @@ -314,12 +325,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -330,25 +342,25 @@ def save_as_npy(self): def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream > &out)""".format( + """void {}(hls::stream> &in0_{}, + hls::stream > &out_{})""".format( self.onnx_node.name, self.get_nodeattr("PE"), self.get_input_datatype().bitwidth(), + self.hls_sname(), self.get_output_datatype().bitwidth(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def get_exp_cycles(self): nlabels = self.get_nodeattr("Labels") diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index d90fa0f05a..2dfca90ed9 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -44,8 +44,8 @@ class Lookup(HLSCustomOp): "Streaming elementwise HLS lookup, mapping indices to values." 
- def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -75,21 +75,21 @@ def get_exp_cycles(self): exp_cycles = int(n_inputs) return exp_cycles - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): return self.get_nodeattr("InputShape") - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ishape = self.get_normal_input_shape() emb_dim = self.get_nodeattr("EmbeddingDim") oshape = list(ishape) + [emb_dim] return tuple(oshape) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ishape = self.get_normal_input_shape() folded_ishape = list(ishape) + [1] return tuple(folded_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") @@ -135,19 +135,19 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): ret = DataType[self.get_nodeattr("InputType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): ret = DataType[self.get_nodeattr("EmbeddingType")] return ret - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() return ibits - def get_outstream_width(self): + def get_outstream_width(self, ind=0): folded_oshape = self.get_folded_output_shape() obits = self.get_output_datatype().bitwidth() return obits * folded_oshape[-1] @@ -159,8 +159,8 @@ def get_number_output_values(self): def global_includes(self): mem_mode = self.get_nodeattr("mem_mode") global_incls = [] + global_incls.append('#include "lookup.hpp"') if mem_mode == "const": - global_incls.append('#include "lookup.hpp"') global_incls.append('#include "embeddings.hpp"') self.code_gen_dict["$GLOBALS$"] = global_incls @@ -184,9 +184,7 @@ def defines(self, var): my_defines.append("#define T_SRC %s" % elem_hls_type) my_defines.append("#define T_DST ap_uint") elif mem_mode == "const": - my_defines.append( - "#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings") - ) + my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) my_defines.append("#define EmbeddingDim %d" % emb_dim) my_defines.append("#define InputType %s" % elem_hls_type) my_defines.append("#define EmbeddingType %s" % emb_hls_type) @@ -206,8 +204,15 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def dataoutstrm(self): @@ -226,12 +231,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", %s);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, "false", @@ -244,10 +250,14 @@ def save_as_npy(self): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -255,20 +265,15 @@ def docompute(self): if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ """StreamingLookup(in0, out, embeddings);""" + InputType, EmbeddingType >(in0_%s, out_%s, embeddings);""" + % (self.hls_sname(), self.hls_sname()) ] elif mem_mode == "external": - hls_impl = """ - if(!in0.empty()) { - ap_uint const base = - (in0.read(), ap_uint(0)); - for(unsigned j = 0; j < EmbeddingSize; j++) { -#pragma HLS PIPELINE II=1 - out.write(mem[base+j]); - } - } - """ - self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl] + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count, + oob_irq);""" + % (self.hls_sname(), self.hls_sname()) + ] def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") @@ -278,33 +283,38 @@ def blackboxfunction(self): packed_output_hls_type = "ap_uint<%d>" % obits if mem_mode == "const": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_input_hls_type, packed_output_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_input_hls_type, + self.hls_sname(), + packed_output_hls_type, + self.hls_sname(), + ) ] elif mem_mode == "external": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ "void " + self.onnx_node.name - + "(hls::stream &in0, hls::stream &out, " - + "T_DST const *const mem)" + + "(hls::stream &in0_%s, hls::stream &out_%s, " + % (self.hls_sname(), self.hls_sname()) + + "T_DST const *const mem, unsigned const size, " + + "unsigned &oob_count, bool &oob_irq)" ] def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") - my_pragmas = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - my_pragmas.append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) + my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] + my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") if mem_mode == "const": - my_pragmas.append( - "#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM" - ) + my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") elif mem_mode == "external": my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control") + my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") else: raise Exception("Unrecognized mem_mode: " + mem_mode) self.code_gen_dict["$PRAGMAS$"] = my_pragmas @@ -324,9 +334,7 @@ def generate_params(self, model, path): # reverse innertmost dim in embeddings to remain compatible with # how we normally encode the data in FINN embeddings_rev = np.flip(embeddings, -1) - embeddings_hls_code = numpy_to_hls_code( - 
embeddings_rev, edt, "embeddings", True, False - ) + embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False) f_thresh = open(weight_filename, "w") f_thresh.write(embeddings_hls_code) f_thresh.close() @@ -348,9 +356,7 @@ def generate_params(self, model, path): pad_amount = align_factor - emb_dim embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) # reshape for packing the innermost dim - embeddings_padded = embeddings_padded.reshape( - -1, emb_elems_per_ext_mem_width - ) + embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) ret = pack_innermost_dim_as_hex_string( embeddings_padded, edt, ext_mem_width, True, prefix="" @@ -475,4 +481,5 @@ def get_verilog_top_module_intf_names(self): if mem_mode == "external": intf_names["axilite"] = ["s_axi_control"] intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))] + intf_names["ap_none"] = ["oob_irq"] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 9d2717dc8c..6699340cac 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -46,8 +46,6 @@ rtlsim_output_to_npy, ) -from . import templates - # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) @@ -60,9 +58,8 @@ class MatrixVectorActivation(HLSCustomOp): """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -192,9 +189,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required MatrixVectorActivation attributes do not exist.""" - ) + info_messages.append("""The required MatrixVectorActivation attributes do not exist.""") # verify the number of inputs depending on noActivation value # check noActivation value to determine the number of inputs @@ -350,22 +345,30 @@ def lut_estimation(self): # adder tree addertree_luts = (W + A) * (2 * Q - 1) # accumulator - acc_bits = W + A + np.ceil(math.log(MW, 2)) + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) acc_luts = acc_bits # thresholds and threshold comparators thr_luts = 0 comp_luts = 0 noact = self.get_nodeattr("noActivation") - if noact == 0: + tmem_style = self.get_nodeattr("ram_style_thresholds") + if (noact == 0) and (tmem_style == "distributed"): odt = self.get_output_datatype() B = odt.bitwidth() thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) comp_luts = (2**B - 1) * acc_bits return int( - c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) - + c2 + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 ) def dsp_estimation(self): @@ -405,20 +408,24 @@ def get_input_datatype(self, ind=0): else: raise Exception("Undefined input ind for this layer type") + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() in_width = i_bits * self.get_nodeattr("SIMD") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width @@ -474,7 +481,7 @@ def get_folded_input_shape(self, ind=0): return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): mh = self.get_nodeattr("MH") pe = self.get_nodeattr("PE") nf = mh // pe @@ -482,13 +489,13 @@ def get_folded_output_shape(self): folded_output_shape = tuple(vecs + [nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): mw = self.get_nodeattr("MW") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [mw]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): mh = self.get_nodeattr("MH") vecs = list(self.get_nodeattr("numInputVectors")) normal_output_shape = tuple(vecs + [mh]) @@ -575,67 +582,101 @@ def get_hls_compatible_weight_tensor(self, orig_weight_matrix): return ret def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" weights = model.get_initializer(self.onnx_node.input[1]) + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + + thresholds = None if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None + idt = self.get_input_datatype() - # calculate minimum and maximum values of accumulator + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we 
need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(lower_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + according to the known values of the thresholds if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() # clip threshold values - clip_upper = None - clip_lower = None - if max_threshold > acc_max + 1: - clip_upper = acc_max + 1 - if min_threshold < acc_min: - clip_lower = acc_min - if (clip_lower is not None) or (clip_upper is not None): + if max_threshold > acc_max or min_threshold < acc_min: warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, clip_lower, clip_upper) + thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() - # get range required by threshold values - tdt_min = min(acc_min, min_threshold) - tdt_max = max(acc_max, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( + acc_min = min(min_threshold, acc_min) + acc_max = max(max_threshold, acc_max) + + # if the acc_range is always non-negative, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( self.onnx_node.name, - str(tdt), + str(adt), ) - self.set_nodeattr("accDataType", tdt.name) - else: - if acc_min < 0: - if abs(acc_min) > acc_max: - adt = DataType.get_smallest_possible(acc_min) - else: - adt = DataType.get_smallest_possible(-acc_max - 1) - else: - adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - self.set_nodeattr("accDataType", adt.name) + + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + divisible by 8 bits + if model.find_direct_successors(self.onnx_node) is
None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) return DataType[self.get_nodeattr("accDataType")] + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: @@ -667,19 +708,10 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): # ensure all thresholds are integer assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -702,10 +734,12 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): of weights. 
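Editor's note: to make the UINT/INT accumulator sizing above concrete, a small worked restatement under the same two inequalities (acc_datatype_bits is an illustrative name, not FINN code):

import math

def acc_datatype_bits(acc_min, acc_max):
    """Smallest integer accumulator width covering [acc_min, acc_max]."""
    if acc_min >= 0:
        return ("UINT", math.ceil(math.log2(acc_max + 1)))
    return ("INT", math.ceil(math.log2(max(-acc_min, 1 + acc_max)) + 1))

# acc_datatype_bits(0, 900)    -> ("UINT", 10)  # UINT10 covers 0..1023
# acc_datatype_bits(-600, 500) -> ("INT", 11)   # INT11 covers -1024..1023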
Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) @@ -715,9 +749,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): if self.get_weight_datatype() == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] if weight_file_mode == "hls_header": - weight_hls_code = numpy_to_hls_code( - weight_tensor, export_wdt, "weights", True, True - ) + weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True) # write weights into C++ header file as dictated by finn-hlslib f_weights = open(weight_file_name, "w") if export_wdt.bitwidth() != 1: @@ -751,14 +783,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") # simd_flipped - weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * simd) weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() # flipped - weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim @@ -819,29 +847,9 @@ def generate_params(self, model, path): self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) if mem_mode == "decoupled": # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + # This file will be ignored when synthesizing UltraScale memory. 
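# Editor's note: for reference, the io_dict consumed by rtlsim_multi_io for a
# decoupled-weights node has the shape shown below (mirroring the execute_node
# code further down); the values are illustrative packed integer words, one
# list entry per stream transaction.
io_dict_example = {
    "inputs": {
        "in0": [0x0, 0x1],        # activation words
        "weights": [0xAB, 0xCD],  # weight words, repeated num_w_reps times
    },
    "outputs": {"out": []},       # populated by the simulation
}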
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( """Please set mem_mode to "const", "decoupled", or "external", @@ -960,9 +968,7 @@ def execute_node(self, context, graph): elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) if mem_mode == "external" or mem_mode == "decoupled": @@ -972,9 +978,7 @@ def execute_node(self, context, graph): # so use it as such for weight generation if self.get_weight_datatype() == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input( - "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits - ) + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { "inputs": {"in0": inp, "weights": wei * num_w_reps}, @@ -989,9 +993,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -1051,9 +1053,7 @@ def defines(self, var): ] if mem_mode == "decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append( - "#define WP1 {}\n".format(wdt.bitwidth()) - ) + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -1070,8 +1070,15 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) mem_mode = self.get_nodeattr("mem_mode") @@ -1085,24 +1092,35 @@ def read_npy_data(self): npy_in = "%s/weights.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) if mem_mode == "decoupled" or mem_mode == "external": 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights ("weights");'.format( - self.get_weightstream_width() + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() ) ) @@ -1122,10 +1140,12 @@ def docompute(self): if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ """Matrix_Vector_Activate_Batch - (in0, out, weights, {}, numReps, {});""".format( + (in0_{}, out_{}, weights, {}, numReps, {});""".format( tmpl_args["TSrcI"], tmpl_args["TDstI"], tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), threshs, map_to_hls_mult_style[self.get_nodeattr("resType")], ) @@ -1139,11 +1159,14 @@ def docompute(self): wdtype_hls_str = export_wdt.get_hls_datatype_str() self.code_gen_dict["$DOCOMPUTE$"] = [ """Matrix_Vector_Activate_Stream_Batch - (in0, out, weights, {}, numReps, {});""".format( + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( tmpl_args["TSrcI"], tmpl_args["TDstI"], tmpl_args["TWeightI"], wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), threshs, map_to_hls_mult_style[self.get_nodeattr("resType")], ) @@ -1172,12 +1195,13 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) @@ -1190,25 +1214,30 @@ def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}( - hls::stream> &in0, - hls::stream> &weights, - hls::stream> &out + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_weightstream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] @@ -1222,43 +1251,23 @@ def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") - # insert depth pragmas only if specified - if in_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth - ) - if out_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=out" % out_fifo_depth - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") if mem_mode == "const": self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') # the weight 
tensor is ap_uint [PE][WMEM] # partition for parallel access along the PE dimension (dim 1) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") ) elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights name=weights_" - + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=8 variable=weights" + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) else: @@ -1273,39 +1282,25 @@ def pragmas(self): if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") ) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") ) # add resource pragma for thresholds if set if ram_style_thresholds == "distributed": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_LUTRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") ) elif ram_style_thresholds == "block": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_BRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") ) elif ram_style_thresholds == "auto": # no pragma needed pass else: - raise Exception( - "Unrecognized ram_style_thresholds value:" + ram_style_thresholds - ) + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) def code_generation_ipi(self): cmd = [] @@ -1329,8 +1324,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -1342,30 +1336,23 @@ def code_generation_ipi(self): % (self.get_nodeattr("ip_vlnv"), node_name, node_name) ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) @@ -1376,11 +1363,11 @@ def 
code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -1406,8 +1393,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1429,9 +1415,7 @@ def get_verilog_top_module_intf_names(self): mem_mode = self.get_nodeattr("mem_mode") sname = self.hls_sname() if mem_mode == "external": - intf_names["s_axis"].append( - ("weights_" + sname, self.get_weightstream_width_padded()) - ) + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) if mem_mode == "decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 @@ -1462,3 +1446,18 @@ def get_op_and_param_counts(self): thres_count = out_features ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 3bf187fa9a..8c7bc83141 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -42,12 +42,13 @@ class Pool_Batch(HLSCustomOp): Output shape (BatchSize,OutImgDim,OutImgDim,Channels) Notes: - # The input shape was chosen to be compatible with im2col (only true when there - is not folding). - # The actual data layout produced by the hlslib kernels is different - for depthwise ops. - * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). + * The actual data layout produced by the hlslib kernels is different + for depthwise ops. 
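Editor's note: derive_characteristic_fxns, added above for MatrixVectorActivation (and later in this diff for Thresholding_Batch), only has to describe the input stimulus; the base class then runs rtlsim from it. A sketch of the io_dict it builds, with stand-in numbers where the real node reads get_folded_input_shape(), calc_wmem() and numInputVectors:

    import numpy as np

    folded_ishape = (1, 196, 3)                  # stand-in folded input shape
    n_inps = int(np.prod(folded_ishape[:-1]))    # one zero word per folded element
    wmem, num_w_reps = 36, 196                   # stand-ins for calc_wmem(), numInputVectors
    io_dict = {
        "inputs": {"in0": [0] * n_inps},
        "outputs": {"out": []},                  # outputs are collected, not driven
    }
    # decoupled/external weights also need a zero word per weight-stream read
    io_dict["inputs"]["weights"] = [0] * (num_w_reps * wmem)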
+ + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) Channels can be folded using PE (SIMD from the input perspective) """ @@ -74,11 +75,11 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("InputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" fxn = self.get_nodeattr("Function") odt = DataType[self.get_nodeattr("OutputDataType")] @@ -98,7 +99,7 @@ def get_output_datatype(self): return odt - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") @@ -107,7 +108,7 @@ def get_normal_input_shape(self): ishape = (batch_size, *odims, k_prod * ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -116,14 +117,14 @@ def get_folded_input_shape(self): folded_ishape = normal_ishape[:-1] + [fold, pe] return tuple(folded_ishape) - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ofm_ch = self.get_nodeattr("Channels") odims = self.get_nodeattr("OutImgDims") batch_size = self.get_nodeattr("BatchSize") oshape = (batch_size, *odims, ofm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) ifm_ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -147,13 +148,13 @@ def get_exp_cycles(self): exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size return int(exp_cycles) - def get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") in_width = int(dt_bits * pe) return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dt_bits = self.get_output_datatype().bitwidth() pe = self.get_nodeattr("PE") out_width = int(dt_bits * pe) @@ -190,13 +191,9 @@ def verify_node(self): # check supported function fnx = self.get_nodeattr("Function") if fnx in ["MaxPool", "QuantAvgPool"]: - info_messages.append( - "Attribute Function contains a supported pool function" - ) + info_messages.append("Attribute Function contains a supported pool function") else: - info_messages.append( - "Attribute Function contains an unsupported pool function" - ) + info_messages.append("Attribute Function contains an unsupported pool function") return info_messages def global_includes(self): @@ -238,17 +235,28 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), 
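Editor's note: the pervasive rename in this diff, from bare in0/out/weights to in0_<suffix> and friends, makes the C++ variable name itself produce the desired AXI-Stream interface name, so the name= override disappears from every INTERFACE pragma. A minimal before/after sketch, assuming hls_sname() returns the tool's stream suffix (e.g. "V" for Vitis HLS):

    def pragma_old(sname):
        # old style: C++ variable "in0", interface renamed via name=
        return "#pragma HLS INTERFACE axis port=in0 name=in0_" + sname

    def pragma_new(sname):
        # new style: the variable is already called "in0_<sname>", no rename needed
        return "#pragma HLS INTERFACE axis port=in0_" + sname

    assert pragma_old("V").split("name=")[1] == "in0_V"
    assert pragma_new("V").endswith("port=in0_V")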
self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -271,17 +279,15 @@ def docompute(self): else: act_hls_dt = "ap_uint<{}>".format(accum_bits) self.code_gen_dict["$DOCOMPUTE$"] += [ - "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format( - act_hls_dt, o_hls_dt, size - ) + "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(act_hls_dt, o_hls_dt, size) ] else: raise Exception("Pool_Batch doesn't currently support " + fxn) self.code_gen_dict["$DOCOMPUTE$"] += [ """Pool_batch, Slice< {} > > - (in0,out, pool_fxn, OFMDimTotal*numReps);""".format( - i_hls_dt, o_hls_dt + (in0_{}, out_{}, pool_fxn, OFMDimTotal*numReps);""".format( + i_hls_dt, o_hls_dt, self.hls_sname(), self.hls_sname() ) ] @@ -301,12 +307,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -322,20 +329,24 @@ def blackboxfunction(self): packed_obits = self.get_outstream_width() packed_out_hls_type = "ap_uint<%d>" % packed_obits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_in_hls_type, + self.hls_sname(), + packed_out_hls_type, + self.hls_sname(), + ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 1e6b72e4d5..baf4aed502 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -60,44 +60,53 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ishape = self.get_nodeattr("shape") return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): oshape = self.get_nodeattr("shape") return oshape def check_divisible_iowidths(self): impl_style = self.get_nodeattr("impl_style") - if impl_style == "hls": - # when 
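Editor's note: every shape/width/datatype getter in this diff gains an ind=0 argument. Existing single-stream ops simply ignore it, but the shared signature lets multi-input ops dispatch on the index without breaking existing callers. An illustrative (hypothetical) two-input op:

    class TwoInputOpSketch:
        """Hypothetical op with two differently sized input streams."""

        def __init__(self, in0_bits, in1_bits):
            self._widths = [in0_bits, in1_bits]

        def get_instream_width(self, ind=0):
            # legacy callers that assume one input keep working (ind defaults to 0)
            return self._widths[ind]

    op = TwoInputOpSketch(8, 16)
    assert op.get_instream_width() == 8
    assert op.get_instream_width(ind=1) == 16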
using impl_style = hls must have the following - # if inWidth > outWidth: inWidth % outWidth = 0 - # if inWidth < outWidth: outWidth % inWidth = 0 - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - if iwidth > owidth: - assert ( - iwidth % owidth == 0 - ), """DWC InWidth is bigger than OutWidth and is not divisible by it. - Please adjust PE and SIMD values so that InWidth % OutWidth = 0 - or alternatively use impl_style = vivado""" - else: - assert ( - owidth % iwidth == 0 - ), """DWC OutWidth is bigger than InWidth and is not divisible by it. - Please adjust PE and SIMD values so that OutWidth % InWidth = 0 - or alternatively use impl_style = vivado""" - - def get_folded_input_shape(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + if impl_style == "vivado": + # the AXIS IP we use in vivado mode only supports + # stream widths that are divisible by 8 + iwidth_d8 = iwidth % 8 == 0 + owidth_d8 = owidth % 8 == 0 + assert ( + iwidth_d8 and owidth_d8 + ), """DWC impl_style=vivado requires + stream widths that are divisible by 8: (%d, %d)""" % ( + iwidth, + owidth, + ) + + def get_iowidth_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + return int(np.lcm(iwidth, owidth)) + + def needs_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + maxwidth = max(iwidth, owidth) + minwidth = min(iwidth, owidth) + impl_style = self.get_nodeattr("impl_style") + return (impl_style == "hls") and (maxwidth % minwidth != 0) + + def get_folded_input_shape(self, ind=0): self.check_divisible_iowidths() iwidth = self.get_nodeattr("inWidth") ishape = self.get_normal_input_shape() @@ -117,7 +126,7 @@ def get_folded_input_shape(self): dummy_t = dummy_t.reshape(new_shape) return dummy_t.shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): self.check_divisible_iowidths() owidth = self.get_nodeattr("outWidth") oshape = self.get_normal_output_shape() @@ -142,11 +151,11 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def get_instream_width(self): + def get_instream_width(self, ind=0): in_width = self.get_nodeattr("inWidth") return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): out_width = self.get_nodeattr("outWidth") return out_width @@ -202,6 +211,12 @@ def defines(self, var): "#define NumInWords %d " % numInWords, "#define numReps %d" % numReps, ] + if self.needs_lcm(): + lcmWidth = self.get_iowidth_lcm() + assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation" + numLCMToOut = numInWords // (lcmWidth / inWidth) + self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) + self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -217,25 +232,54 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 
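Editor's note: the new LCM machinery handles width pairs where neither side divides the other; the HLS implementation then converts in two hops through an LCM-wide intermediate stream. A sketch of the arithmetic, equivalent to get_iowidth_lcm/needs_lcm and the NumLCMToOut define above:

    import numpy as np

    def needs_lcm(iwidth, owidth, impl_style="hls"):
        # two stages are needed only when neither width divides the other
        return impl_style == "hls" and max(iwidth, owidth) % min(iwidth, owidth) != 0

    # e.g. 24 -> 16 bits: no single-stage ratio exists, so go 24 -> 48 -> 16
    iwidth, owidth, num_in_words = 24, 16, 4
    assert needs_lcm(iwidth, owidth)
    lcm = int(np.lcm(iwidth, owidth))                     # LCMWidth = 48
    assert (num_in_words * iwidth) % lcm == 0, "Error in DWC LCM calculation"
    num_lcm_to_out = num_in_words * iwidth // lcm         # NumLCMToOut = 2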
'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) + if self.needs_lcm(): + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ) + ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): # TODO continue with fxns below, they are copy-pasted op = "StreamingDataWidthConverter_Batch" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out, numReps);" % (op) - ] + if self.needs_lcm(): + self.code_gen_dict["$DOCOMPUTE$"] = [ + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ), + "%s(in0_%s, intermediate, numReps);" + % (op, self.hls_sname()), + "%s(intermediate, out_%s, numReps);" + % (op, self.hls_sname()), + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s, numReps);" + % (op, self.hls_sname(), self.hls_sname()) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -253,12 +297,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -273,20 +318,26 @@ def blackboxfunction(self): out_packed_bits = self.get_outstream_width() out_packed_hls_type = "ap_uint<%d>" % out_packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, in_packed_hls_type, out_packed_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if self.needs_lcm(): + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") @@ -312,9 +363,7 @@ def execute_node(self, context, graph): inp = context[node.input[0]] assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple( - exp_shape - ), "Input shape does not match expected shape." + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." 
if self.get_input_datatype() == DataType["BIPOLAR"]: # store bipolar activations as binary @@ -388,8 +437,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -434,8 +482,7 @@ def code_generation_ipi(self): return cmd else: raise Exception( - "DWC implementation style %s not supported, please use hls or vivado" - % impl_style + "DWC implementation style %s not supported, please use hls or vivado" % impl_style ) def lut_estimation(self): @@ -466,3 +513,28 @@ def lut_estimation(self): cset_luts += outw return int(cnt_luts + cset_luts) + + def prepare_rtlsim(self): + assert self.get_nodeattr("impl_style") != "vivado", ( + "StreamingDataWidthConverter impl_style " + "cannot be vivado for rtlsim. Only impl_style=rtl supported." + ) + super().prepare_rtlsim() + + def code_generation_ipgen(self, model, fpgapart, clk): + # no codegen required for impl_style=vivado since + # that uses premade, configurable AXIS IP + if self.get_nodeattr("impl_style") == "hls": + super().code_generation_ipgen(model, fpgapart, clk) + + def ipgen_singlenode_code(self): + # no IP generation required for impl_style=vivado since + # that uses premade, configurable AXIS IP + if self.get_nodeattr("impl_style") == "hls": + super().ipgen_singlenode_code() + else: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # set ipgen_path and ip_path so that HLSSynthIP + # and CreatedStitchedIP transformations do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index a7c3cd0be5..1249bc1251 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -41,38 +41,61 @@ class StreamingFIFO(HLSCustomOp): - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.strm_fifo_wrapper = templates.strm_fifo_wrapper def get_nodeattr_types(self): - my_attrs = { - # FIFO depth - "depth": ("i", True, 0), - # folded shape of input/output - "folded_shape": ("ints", True, []), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # rtl - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure FIFO - "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), - # FPGA resource type for FIFOs when impl_style is vivado - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM (on UltraScale+) - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + # FIFO depth + "depth": ("i", True, 0), + # folded shape of input/output + "folded_shape": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + # Toggle between hls or IPI implementation + # rtl - use the hls generated IP during stitching + # vivado - use the AXI Infrastructure FIFO + "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), + # FPGA resource type for 
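Editor's note: for impl_style=vivado the DWC is a stock AXIS width-converter IP, so there is nothing to HLS-synthesize; the override only has to leave ipgen_path/ip_path pointing somewhere valid so HLSSynthIP and CreateStitchedIP do not complain. A self-contained sketch of that pattern (attribute plumbing simplified into a dict):

    class DWCSketch:
        def __init__(self, impl_style, code_gen_dir):
            self.attrs = {"impl_style": impl_style,
                          "code_gen_dir_ipgen": code_gen_dir,
                          "ipgen_path": None, "ip_path": None}

        def ipgen_singlenode_code(self):
            if self.attrs["impl_style"] == "hls":
                pass  # real node: run HLS synthesis via the parent class here
            else:
                # premade AXIS IP: no codegen, just satisfy downstream path checks
                self.attrs["ipgen_path"] = self.attrs["code_gen_dir_ipgen"]
                self.attrs["ip_path"] = self.attrs["code_gen_dir_ipgen"]

    dwc = DWCSketch("vivado", "/tmp/codegen_dwc")
    dwc.ipgen_singlenode_code()
    assert dwc.attrs["ip_path"] == "/tmp/codegen_dwc"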
FIFOs when impl_style is vivado + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM (on UltraScale+) + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), + # the FIFO does not need its own FIFOs + "inFIFODepths": ("ints", False, [0]), + "outFIFODepths": ("ints", False, [0]), + } + ) return my_attrs + def get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -97,6 +120,14 @@ def infer_node_datatype(self, model): def verify_node(self): pass + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." @@ -106,9 +137,7 @@ def get_verilog_top_module_name(self): def code_generation_ipgen(self, model, fpgapart, clk): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format( - code_gen_dir, self.onnx_node.name - ) + verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) os.makedirs(verilog_dir) # copy Q_srl.v from finn-rtllib to verilog directory memstream_dir = get_finn_root() + "/finn-rtllib/memstream/hdl/" @@ -144,9 +173,7 @@ def code_generation_ipgen(self, model, fpgapart, clk): def ipgen_singlenode_code(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format( - code_gen_dir, self.onnx_node.name - ) + verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) # prepare the IP packaging tcl template template = templates.ip_package_tcl self.code_gen_dict.clear() @@ -180,15 +207,11 @@ def ipgen_singlenode_code(self): self.set_nodeattr("ip_vlnv", vlnv) self.code_gen_dict.clear() - def get_normal_input_shape(self): - depth = self.get_nodeattr("depth") - # depth has to be between 2 and 256 with the current - # StreamingFIFO implementation + def get_normal_input_shape(self, ind=0): + depth = self.get_adjusted_depth() assert depth >= 2, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": - warnings.warn( - "Depth is high, set between 2 and 256 for efficient SRL implementation" - ) + warnings.warn("Depth is high, set between 2 and 256 for efficient SRL implementation") # derive normal shape from folded shape # StreamingFIFOs are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -211,27 +234,33 @@ def get_normal_input_shape(self): return normal_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_folded_input_shape(self): + 
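Editor's note: get_adjusted_depth's rounding relies on int.bit_length(): for any depth > 1, (depth - 1).bit_length() is the exponent of the next power of two. Worked values:

    def round_up_pow2(depth):
        # 600 -> 1024, 512 -> 512, 2 -> 2; the Vivado FIFO may fail otherwise
        return 1 << (depth - 1).bit_length()

    assert round_up_pow2(600) == 1024
    assert round_up_pow2(512) == 512
    assert round_up_pow2(2) == 2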
def get_folded_input_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return self.get_nodeattr("folded_shape") - def get_instream_width(self): + def get_instream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): dtype = DataType[self.get_nodeattr("dataType")] folded_shape = self.get_nodeattr("folded_shape") in_width = folded_shape[-1] * dtype.bitwidth() return in_width + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + return DataType[self.get_nodeattr("dataType")] + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -262,9 +291,7 @@ def execute_node(self, context, graph): np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) output = self.rtlsim(sim, inp) @@ -273,9 +300,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) oshape = self.get_normal_output_shape() @@ -328,7 +353,7 @@ def code_generation_ipi(self): elif impl_style == "vivado": cmd = [] node_name = self.onnx_node.name - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() ram_style = self.get_nodeattr("ram_style") # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] @@ -340,8 +365,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -362,8 +386,7 @@ def code_generation_ipi(self): ) cmd.append( "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " - "[get_bd_cells /%s/fifo]" - % (np.ceil(self.get_outstream_width() / 8), node_name) + "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " @@ -375,8 +398,7 @@ def code_generation_ipi(self): ) cmd.append( "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aresetn]" - % (node_name, rst_name, node_name) + "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) ) cmd.append( "connect_bd_net [get_bd_pins %s/%s] " @@ -385,15 +407,14 @@ def code_generation_ipi(self): return cmd else: raise Exception( - "FIFO implementation style %s not supported, please use rtl or vivado" - % impl_style + "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style ) def 
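Editor's note: the Vivado AXIS FIFO is sized in whole bytes (CONFIG.TDATA_NUM_BYTES), which is where the ceil-divide above comes from, and also why the vivado-style DWC earlier in this diff insists on stream widths divisible by 8:

    import numpy as np

    tdata_num_bytes = int(np.ceil(47 / 8))  # a 47-bit stream still occupies 6 bytes
    assert tdata_num_bytes == 6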
bram_estimation(self): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "block"): @@ -418,7 +439,7 @@ def uram_estimation(self): impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"): @@ -428,7 +449,7 @@ def uram_estimation(self): return (math.ceil(depth / 4096)) * (math.ceil(W / 72)) def bram_efficiency_estimation(self): - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() bram16_est = self.bram_estimation() if bram16_est == 0: @@ -441,7 +462,7 @@ def lut_estimation(self): """Calculates resource estimations for LUTs""" impl = self.get_nodeattr("impl_style") ram_type = self.get_nodeattr("ram_style") - depth = self.get_nodeattr("depth") + depth = self.get_adjusted_depth() W = self.get_instream_width() address_luts = 2 * math.ceil(math.log(depth, 2)) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index 882b40a0aa..8f294da4ac 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -57,11 +57,11 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("dataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("dataType")] @@ -82,13 +82,13 @@ def is_1d(self): ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() return (ifm_dim[0] == 1) and (k[0] == 1) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") ifm_ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") @@ -99,24 +99,20 @@ def get_folded_input_shape(self): folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") k_h, k_w = tuple(self.get_nodeattr("PoolDim")) ifm_ch = self.get_nodeattr("NumChannels") ceil_mode = self.get_nodeattr("CeilMode") if not self.is_1d(): - assert ( - ifm_dim_h % k_h == 0 - ), "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" - assert ( - ifm_dim_w % k_w == 0 - ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" + assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" + assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) return oshape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): 
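Editor's note: the FIFO resource estimates are plain tiling arithmetic; for URAM, one block holds 4096 words of 72 bits, so the count is a product of two ceils, mirroring uram_estimation above:

    import math

    def uram_tiles(depth, width):
        # tile a (depth x width) FIFO onto 4096x72 UltraRAM blocks
        return math.ceil(depth / 4096) * math.ceil(width / 72)

    assert uram_tiles(4096, 72) == 1
    assert uram_tiles(5000, 100) == 4   # 2 deep x 2 wide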
# even though there is no folding in the current hlslib op, # insert a time multiplexing axis to remain compatible with the # shapes produced by the rest of the dataflow pipeline @@ -155,7 +151,7 @@ def get_exp_cycles(self): # TODO: adjust inaccurate formula return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - def get_instream_width(self): + def get_instream_width(self, ind=0): dt_bits = self.get_input_datatype().bitwidth() pe = self.get_nodeattr("PE") ifm_ch = self.get_nodeattr("NumChannels") @@ -165,7 +161,7 @@ def get_instream_width(self): in_width = int(dt_bits * ifm_ch) return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): """For streaming maxpool out stream width is the same as in stream width""" return self.get_instream_width() @@ -254,17 +250,28 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -275,7 +282,8 @@ def docompute(self): else: op = "StreamingMaxPool" self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out);" % (op) + "%s(in0_%s, out_%s);" + % (op, self.hls_sname(), self.hls_sname()) ] else: dtype = self.get_input_datatype() @@ -285,14 +293,14 @@ def docompute(self): op = "StreamingMaxPool_Precision_1d" self.code_gen_dict["$DOCOMPUTE$"] = [ """%s(in0, out);""" - % (op, dtype_hls, minval_str) + OutputSize, %s, %s>(in0_%s, out_%s);""" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) ] else: op = "StreamingMaxPool_Precision" self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out);" - % (op, dtype_hls, minval_str) + "%s(in0_%s, out_%s);" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) ] def dataoutstrm(self): @@ -311,12 +319,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -329,20 +338,24 @@ def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis 
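Editor's note: StreamingMaxPool switches to the 1D kernel when the normalized attributes collapse one spatial dimension; a sketch of the is_1d() test referenced above:

    def is_1d(ifm_dim, k):
        # attrs are normalized to (h, w) tuples beforehand
        return ifm_dim[0] == 1 and k[0] == 1

    assert is_1d((1, 32), (1, 2))        # 1D case: pool along W only
    assert not is_1d((32, 32), (2, 2))   # square 2D case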
port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e73fa9bb28..4e03e6daf9 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -109,107 +109,6 @@ exit 0 """ -# verilog wrapper for decoupled mem mode -decoupled_wrapper = """ -module $TOPNAME$( -ap_clk, -ap_rst_n, -in0_$HLS_SNAME$_TDATA, -in0_$HLS_SNAME$_TVALID, -in0_$HLS_SNAME$_TREADY, -out_$HLS_SNAME$_TDATA, -out_$HLS_SNAME$_TVALID, -out_$HLS_SNAME$_TREADY -); - -input ap_clk; -input ap_rst_n; -input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; -input in0_$HLS_SNAME$_TVALID; -output in0_$HLS_SNAME$_TREADY; -output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; -output out_$HLS_SNAME$_TVALID; -input out_$HLS_SNAME$_TREADY; - -reg [31:0] config_address = 0; -reg config_ce = 0; -reg config_we = 0; -reg [31:0] config_d0 = 0; -wire [31:0] config_q0; - -//multiple wire AXI Streams -wire m_axis_0_afull; -// FIFO count to generate programmable full -wire [5:0] fifo_0_count; -wire m_axis_0_tready; -wire m_axis_0_tvalid; -wire $WEIGHT_RANGE$ m_axis_0_tdata; - -//memstream component - -memstream -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for -// memory, set per-stream offsets in memory, set per-stream widths -.CONFIG_EN(1), -.NSTREAMS(1), -.MEM_DEPTH($MEM_DEPTH$), -.MEM_WIDTH($WEIGHT_WIDTH$), -.MEM_INIT("./"), -.RAM_STYLE("$RAM_STYLE$"), - -//widths per stream -.STRM0_WIDTH($WEIGHT_WIDTH$), - -//depths per stream -.STRM0_DEPTH($WSTREAM_DEPTH$), - -//offsets for each stream -.STRM0_OFFSET(0) -) -mem -( -.aclk(ap_clk), -.aresetn(ap_rst_n), - -//optional configuration interface compatible with ap_memory -.config_address(config_address), -.config_ce(config_ce), -.config_we(config_we), -.config_d0(config_d0), -.config_q0(config_q0), - -//multiple output AXI Streams, TDATA width rounded to multiple of 8 bits -.m_axis_0_afull(m_axis_0_afull), -.m_axis_0_tready(m_axis_0_tready), -.m_axis_0_tvalid(m_axis_0_tvalid), -.m_axis_0_tdata(m_axis_0_tdata) - - -); - - -//MVA_Stream_Unit - -$LAYER_NAME$ -MVA_Stream_U -( -.ap_clk(ap_clk), //input -.ap_rst_n(ap_rst_n), //input -.in0_$HLS_SNAME$_TDATA(in0_$HLS_SNAME$_TDATA), //$IN_RANGE$ input -.in0_$HLS_SNAME$_TVALID(in0_$HLS_SNAME$_TVALID), //input -.in0_$HLS_SNAME$_TREADY(in0_$HLS_SNAME$_TREADY), //output -.weights_$HLS_SNAME$_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input -.weights_$HLS_SNAME$_TVALID(m_axis_0_tvalid), //input -.weights_$HLS_SNAME$_TREADY(m_axis_0_tready), //output -.out_$HLS_SNAME$_TDATA(out_$HLS_SNAME$_TDATA), //$OUT_RANGE$ output -.out_$HLS_SNAME$_TVALID(out_$HLS_SNAME$_TVALID), //output -.out_$HLS_SNAME$_TREADY(out_$HLS_SNAME$_TREADY) //input -); - -endmodule -""" - ip_package_tcl = """ ## IP Info set Vendor "xilinx.com" @@ -319,6 +218,7 @@ ap_clk, ap_rst_n, count, +maxcount, in0_$HLS_SNAME$_TDATA, in0_$HLS_SNAME$_TVALID, in0_$HLS_SNAME$_TREADY, @@ -330,6 +230,7 @@ input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; +output $COUNT_RANGE$ maxcount; input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; input 
in0_$HLS_SNAME$_TVALID; output in0_$HLS_SNAME$_TREADY; @@ -346,6 +247,7 @@ .clock(ap_clk), .reset(!ap_rst_n), .count(count), + .maxcount(maxcount), .i_d(in0_$HLS_SNAME$_TDATA), .i_v(in0_$HLS_SNAME$_TVALID), .i_r(in0_$HLS_SNAME$_TREADY), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 5383cc1f4b..72ee2f7af6 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -45,8 +45,6 @@ rtlsim_output_to_npy, ) -from . import templates - # ONNX i/o tensor shape assumptions for Thresholding: # input 0 is the input tensor, shape (..., NumChannels) # input 1 is the threshold tensor, shape (NumChannels, n_thres) @@ -57,9 +55,8 @@ class Thresholding_Batch(HLSCustomOp): """Class that corresponds to finn-hls Thresholding_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -75,9 +72,6 @@ def get_nodeattr_types(self): "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -148,9 +142,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required Threshold_Batch attributes do not exist.""" - ) + info_messages.append("""The required Threshold_Batch attributes do not exist.""") return info_messages @@ -185,11 +177,11 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] @@ -219,13 +211,15 @@ def minimize_accumulator_width(self, model): threshold_tensor ).all(), "Thresholds can't be expressed with type %s" % str(tdt) self.set_nodeattr("weightDataType", tdt.name) + # Update QONNX DataType of tensor for consistency + model.set_tensor_datatype(self.onnx_node.input[1], tdt) return DataType[self.get_nodeattr("weightDataType")] - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") @@ -251,7 +245,7 @@ def get_ap_int_max_w(self): weightstream = self.get_weightstream_width() return max([weightstream, temp_value]) - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = ich // pe @@ -259,17 +253,17 @@ def get_folded_input_shape(self): folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def 
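Editor's note: a recurring constructor change in this diff (Thresholding_Batch, TLastMarker, StreamingFIFO, the upsampler): subclasses now accept and forward **kwargs, so options introduced on HLSCustomOp can flow through without touching every subclass. Minimal pattern, with an illustrative keyword:

    class BaseOpSketch:
        def __init__(self, onnx_node, **kwargs):
            self.onnx_node = onnx_node
            self.extra = kwargs        # the base class owns any shared options

    class ThresholdingSketch(BaseOpSketch):
        def __init__(self, onnx_node, **kwargs):
            super().__init__(onnx_node, **kwargs)

    op = ThresholdingSketch("node0", runtime_writeable=1)  # hypothetical option
    assert op.extra["runtime_writeable"] == 1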
get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [ich]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() @@ -311,30 +305,17 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr( - "numSteps" - ), "Mismatch in threshold steps" + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" if not self.get_input_datatype().signed(): # ensure all thresholds are nonnegative assert (orig_thres_matrix >= 0).all() # ensure all thresholds are integer - assert np.equal( - np.mod(orig_thres_matrix, 1), 0 - ).all(), "Need int threshold tensor" + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -357,10 +338,12 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): run-time reconfig of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() @@ -465,27 +448,9 @@ def generate_params(self, model, path): weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_thresholds = np.zeros_like(thresholds, dtype=np.float32) - else: - synth_thresholds = thresholds - self.make_weight_file( - synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + # This file will be ignored when synthesizing UltraScale memory. 
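Editor's note: the channel handling in get_hls_compatible_threshold_tensor can be modeled directly: a single shared threshold row is broadcast to all mh channels, then rows are dealt round-robin over the pe lanes (the slice below stands in for qonnx's interleave_matrix_outer_dim_from_partitions):

    import numpy as np

    mh, pe = 4, 2
    thres = np.array([[0.0, 4.0]])            # one row, numSteps = 2
    assert np.equal(np.mod(thres, 1), 0).all(), "Need int threshold tensor"
    if thres.shape[0] == 1:
        thres = np.tile(thres, (mh, 1))       # duplicate across channels
    assert thres.shape[0] == mh
    lanes = np.stack([thres[p::pe] for p in range(pe)])  # lane p: channels p, p+pe, ...
    assert lanes.shape == (pe, mh // pe, 2)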
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception("Unrecognized mem_mode") @@ -546,15 +511,11 @@ def execute_node(self, context, graph): out = 2 * out - 1 context[node.output[0]] = out oshape = self.get_normal_output_shape() - assert ( - context[node.output[0]].shape == oshape - ), """Output shape is not as expected""" + assert context[node.output[0]].shape == oshape, """Output shape is not as expected""" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) if self.get_nodeattr("mem_mode") == "decoupled": @@ -579,9 +540,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -603,13 +562,17 @@ def global_includes(self): # TODO check and add whatever missing def defines(self, var): + numReps = 1 numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = int(np.prod(numInputVectors)) + total_spatial_size = int(np.prod(numInputVectors)) + self.code_gen_dict["$DEFINES$"] = [ - """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format( + """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n + #define ImgDim1 {}""".format( self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps, + total_spatial_size, ) ] if self.get_nodeattr("mem_mode") == "decoupled": @@ -617,8 +580,7 @@ def defines(self, var): "#define ActVal1 %d" % self.get_nodeattr("ActVal") ) self.code_gen_dict["$DEFINES$"].append( - "#define ThresType1 %s" - % self.get_weight_datatype().get_hls_datatype_str() + "#define ThresType1 %s" % self.get_weight_datatype().get_hls_datatype_str() ) self.code_gen_dict["$DEFINES$"].append( "#define NumSteps1 %d" % self.get_nodeattr("numSteps") @@ -636,8 +598,15 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "decoupled": @@ -650,42 +619,50 @@ def read_npy_data(self): npy_in = "%s/thresholds.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, ImgDim1);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( 
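Editor's note: Thresholding's defines now pin numReps to 1 and expose the full spatial size as ImgDim1; the repetition moves into the kernel template and the weight-stream reader (note ImgDim1 replacing numReps in the npy2apintstream call further down). The values involved, for an example numInputVectors:

    import numpy as np

    num_input_vectors = [1, 28, 28]              # example numInputVectors attribute
    img_dim1 = int(np.prod(num_input_vectors))   # -> #define ImgDim1 784
    num_reps = 1                                 # -> #define numReps 1
    assert img_dim1 == 784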
+ self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "decoupled": self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights ("weights");'.format( - self.get_weightstream_width() + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() ) ) def docompute(self): tmpl_args = self.get_template_param_values() - # TODO: why put some template parameters into defines and not others? - # should ImgDim be defined or just filled in here like we do now? node = self.onnx_node - inp_vecs = self.get_nodeattr("numInputVectors") - total_spatial_size = int(np.prod(inp_vecs)) mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, NumChannels1, PE1, {}, {}> - (in0, out, threshs, numReps);""".format( + """{} + (in0_{}, out_{}, threshs, numReps);""".format( node.op_type, - total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), ) ] elif mem_mode == "decoupled": @@ -693,12 +670,14 @@ def docompute(self): # - for cppsim the repetition comes from the threshold stream reader+input # - for synth the unit runs continuously anyway (ap_ctrl_none) self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> - (in0, out, weights, 1);""".format( + """{} + (in0_{}, out_{}, weights_{}, numReps);""".format( "Thresholding_Stream_Batch", - total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), ) ] else: @@ -721,12 +700,13 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) @@ -738,24 +718,29 @@ def save_as_npy(self): def blackboxfunction(self): if self.get_nodeattr("mem_mode") == "const": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] elif self.get_nodeattr("mem_mode") == "decoupled": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &weights, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_weightstream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] else: @@ -763,30 +748,22 @@ def blackboxfunction(self): def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + 
"#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") if self.get_nodeattr("mem_mode") == "const": # the threshold tensor is acc_type [PE][TMEM][N_THRES] # partition for parallel access along PE and N_THRES # dimensions (dims 1 and 3) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") ) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") ) # set resource type ram_style = self.get_nodeattr("ram_style") @@ -797,17 +774,11 @@ def pragmas(self): if pe < ich: if ram_style == "distributed": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_LUTRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") ) elif ram_style == "block": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_BRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") ) else: raise Exception( @@ -818,8 +789,7 @@ def pragmas(self): ) elif self.get_nodeattr("mem_mode") == "decoupled": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights name=weights_" - + self.hls_sname() + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) def code_generation_ipi(self): @@ -840,8 +810,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -853,30 +822,23 @@ def code_generation_ipi(self): % (self.get_nodeattr("ip_vlnv"), node_name, node_name) ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_tmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_tmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) @@ -887,11 +849,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + 
"connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -917,8 +879,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -960,3 +921,18 @@ def ipgen_extra_directives(self): "Return a list of extra tcl directives for HLS synthesis." return ["config_compile -pipeline_style frp"] + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_tmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 7386aa7e63..9309841b2e 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -37,8 +37,8 @@ class TLastMarker(HLSCustomOp): (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -130,9 +130,9 @@ def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ "for(unsigned int i=0; i &in0, - hls::stream &out, unsigned int numIters)""" - % self.onnx_node.name + """void %s(hls::stream &in0_%s, + hls::stream &out_%s, unsigned int numIters)""" + % (self.onnx_node.name, self.hls_sname(), self.hls_sname()) ] else: self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream &in0, hls::stream &out)""" - % self.onnx_node.name + """void %s(hls::stream &in0_%s, + hls::stream &out_%s)""" + % (self.onnx_node.name, self.hls_sname(), self.hls_sname()) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) dyn_iters = self.get_nodeattr("DynIters") @@ -211,38 +218,36 @@ def pragmas(self): "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def get_number_output_values(self): return self.get_nodeattr("NumIters") - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") elem_width = self.get_nodeattr("ElemWidth") n_packed_elems = stream_width // elem_width n_iters = self.get_nodeattr("NumIters") return (1, n_iters, n_packed_elems) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): return 
self.get_folded_input_shape() - def get_instream_width(self): + def get_instream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): stream_width = self.get_nodeattr("StreamWidth") return stream_width def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream in0 ("in0");' + 'hls::stream in0_%s ("in0_%s");' % (self.hls_sname(), self.hls_sname()) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream out ("out");' + 'hls::stream out_%s ("out_%s");' % (self.hls_sname(), self.hls_sname()) ) def get_verilog_top_module_intf_names(self): diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index b62e4f2f67..9c0db1f3df 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os import warnings from qonnx.core.datatype import DataType @@ -42,8 +41,8 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): The layer expects square feature maps for the in and output. """ - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -57,6 +56,8 @@ def get_nodeattr_types(self): "inputDataType": ("s", True, ""), # Batch size "numInputVectors": ("i", False, 1), + # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim + "DimMode": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -64,28 +65,41 @@ def get_nodeattr_types(self): def get_exp_cycles(self): OFMDim = self.get_nodeattr("OFMDim") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = OFMDim * OFMDim * batch_size + is_2d = self.get_nodeattr("DimMode") == 0 + reps = 1 + if is_2d: + OFMDim = OFMDim * OFMDim + reps = batch_size + exp_cycles = OFMDim * reps return int(exp_cycles) - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): IFMDim = self.get_nodeattr("IFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - ishape = (batch, IFMDim, IFMDim, num_ch) + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + ishape = (batch, IFMDim, IFMDim, num_ch) + else: + ishape = (batch, IFMDim, 1, num_ch) return ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): OFMDim = self.get_nodeattr("OFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - oshape = (batch, OFMDim, OFMDim, num_ch) + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + oshape = (batch, OFMDim, OFMDim, num_ch) + else: + oshape = (batch, OFMDim, 1, num_ch) return oshape - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): normal_ishape = list(self.get_normal_input_shape()) return tuple(normal_ishape) - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): normal_oshape = list(self.get_normal_output_shape()) return tuple(normal_oshape) @@ -93,9 +107,7 @@ def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ( - ishape == exp_ishape - ), "Unexpect input shape for 
UpsampleNearestNeighbour_Batch." + assert ishape == exp_ishape, "Unexpect input shape for UpsampleNearestNeighbour_Batch." return super().make_const_shape_op(oshape) def infer_node_datatype(self, model): @@ -115,21 +127,21 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] return ret - def get_output_datatype(self): + def get_output_datatype(self, ind=0): """Returns FINN DataType of output. (Same as input datatype)""" return self.get_input_datatype() - def get_instream_width(self): + def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return ibits * ifm_ch - def get_outstream_width(self): + def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() ifm_ch = self.get_nodeattr("NumChannels") return obits * ifm_ch @@ -173,24 +185,46 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0, out, numReps);""" - ] + is_2d = self.get_nodeattr("DimMode") == 0 + batch = self.get_nodeattr("numInputVectors") + if is_2d: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" + % (self.hls_sname(), self.hls_sname()) + ] + else: + assert batch == 1, "1D upsampler currently needs numReps=1" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" + % (self.hls_sname(), self.hls_sname()) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -208,12 +242,13 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) @@ -226,27 +261,30 @@ def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS 
INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -268,9 +306,7 @@ def execute_node(self, context, graph): ), """Input shape doesn't match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 27b23dd328..bd5bb75f1d 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -29,6 +29,7 @@ import math import numpy as np import os +import textwrap import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -41,6 +42,7 @@ from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -48,12 +50,13 @@ class VectorVectorActivation(HLSCustomOp): """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { "PE": ("i", True, 0), + "SIMD": ("i", False, 1), "Dim": ("ints", True, []), # [H, W] "Channels": ("i", True, 0), "Kernel": ("ints", True, []), # [H, W] @@ -67,82 +70,147 @@ def get_nodeattr_types(self): "accDataType": ("s", False, "INT32"), # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. 
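+            # For illustration only (assumed usage, with getCustomOp from
+            # qonnx.custom_op.registry): a build step could opt a node into
+            # streaming, runtime-reconfigurable weights like so:
+            #   inst = getCustomOp(node)
+            #   inst.set_nodeattr("mem_mode", "decoupled")
+            #   inst.set_nodeattr("runtime_writeable_weights", 1)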
+ "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" weights = model.get_initializer(self.onnx_node.input[1]) k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") # put weights into the shape expected by calculate_matvec_accumulator_range weights = weights.reshape(fm, k_h * k_w).transpose() + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) else: thresholds = None idt = self.get_input_datatype() - # calculate minimum and maximum values of accumulator + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + # according to the known values of the thresholds if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() # clip threshold values - clip_upper = None - clip_lower = None - if max_threshold > acc_max + 1: - clip_upper = acc_max + 1 - if min_threshold < acc_min: - clip_lower = acc_min - if (clip_lower is not None) or (clip_upper is not None): + if max_threshold > acc_max or min_threshold < acc_min: warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, clip_lower, clip_upper) + thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() - # get range required by threshold values - tdt_min = min(acc_min, min_threshold) - tdt_max = max(acc_max, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( + acc_min = min(min_threshold, acc_min) + acc_max = 
max(max_threshold, acc_max) + + # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( self.onnx_node.name, - str(tdt), + str(adt), ) - self.set_nodeattr("accDataType", tdt.name) - else: - if acc_min < 0: - if abs(acc_min) > acc_max: - adt = DataType.get_smallest_possible(acc_min) - else: - adt = DataType.get_smallest_possible(-acc_max - 1) - else: - adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - self.set_nodeattr("accDataType", adt.name) + + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) + return DataType[self.get_nodeattr("accDataType")] + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + def calc_wmem(self): """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") pe = self.get_nodeattr("PE") - wmem = k_h * k_w * ch // pe + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd return wmem def calc_tmem(self): @@ -176,7 +244,7 @@ def infer_node_datatype(self, model): def verify_node(self): pass - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -184,31 +252,50 @@ def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] - def get_output_datatype(self): + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] - def 
get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - in_width = i_bits * self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + in_width = i_bits * simd * pe return in_width - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") - sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") + simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") + kernel_2 = k_h * k_w + assert kernel_2 % simd == 0, "Requirement kernel (k_h * k_w) divisable by SIMD is violated." + sf = kernel_2 // simd + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." nf = ch // pe - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe @@ -216,14 +303,14 @@ def get_folded_output_shape(self): folded_output_shape = tuple([1, dim_h, dim_w, nf, pe]) return folded_output_shape - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") normal_output_shape = tuple([1, dim_h, dim_w, ch]) @@ -235,6 +322,7 @@ def get_number_output_values(self): def get_exp_cycles(self): pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") @@ -242,7 +330,7 @@ def get_exp_cycles(self): batch_size = 1 # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv return int(exp_cycles) def get_template_param_values(self): @@ -251,13 +339,31 @@ def get_template_param_values(self): ret = dict() inp_hls_str = self.get_input_datatype().get_hls_datatype_str() out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] wt_is_bipolar = 
self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # fill in TSrcI and TWeightI - # TODO handle bipolar inputs - if inp_is_bipolar or wt_is_bipolar: - raise Exception("VVAU node doesn't support bipolar values yet.") - else: + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): ret["TSrcI"] = "Slice<%s>" % inp_hls_str ret["TWeightI"] = "Identity" @@ -268,6 +374,7 @@ def get_template_param_values(self): def get_hls_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") wmem = self.calc_wmem() @@ -279,13 +386,23 @@ def get_hls_compatible_weight_tensor(self, orig_weight_matrix): ), """Weights matrix doesn't have expected shape (channels, 1, kernel_size, kernel_size)""" ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 ret = ret.reshape(ch, k_h * k_w) # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, 1) + ret = ret.reshape(1, pe, wmem, simd) return ret def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") tmem = self.calc_tmem() @@ -295,14 +412,24 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + if inp_is_bipolar and wt_is_bipolar: + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (ch, 1)) + assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)" # 
distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -319,43 +446,151 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - # weights - weights = model.get_initializer(self.onnx_node.input[1]) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. + + Arguments: + + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) - wdt = self.get_weight_datatype() - code_gen_dir = path + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<{},{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + else: + f_weights.write( + "const BinaryWeights<{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * simd) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + 
weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown weight_file_mode") - """Saves weights into params.h""" - weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) - # write weights into params.h - f_weights = open("{}/params.h".format(code_gen_dir), "w") + else: + raise Exception("Unknown weight_file_mode") - if wdt.bitwidth() != 1: - f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( - wdt.get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.calc_wmem(), - ) - ) + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + # save hlslib-compatible weights in params.h + weight_filename = "{}/params.h".format(code_gen_dir) + self.make_weight_file(weights, "hls_header", weight_filename) + elif mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. 
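+                # Illustrative sketch of the memblock.dat layout (example
+                # values assumed): one weight-stream word per line as a hex
+                # string, padded to a multiple of 4 bits, WMEM lines in
+                # total. E.g. for PE=2, SIMD=2 and 4-bit weights (a 16-bit
+                # stream, 4 hex digits per line):
+                #   1fa0
+                #   0b3c
+                # The memstream RTL presumably consumes this file via a
+                # $readmemh-style initializer.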
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: - f_weights.write( - "const BinaryWeights<1,{},{}> weights = ".format( - self.get_nodeattr("PE"), self.calc_wmem() - ) + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - f_weights.write(weight_hls_code) - f_weights.close() # save thresholds in thresh.h if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # use UINT32 threshold export for bipolar times bipolar + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # get computed threshold datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] + assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( @@ -368,8 +603,11 @@ def generate_params(self, model, path): # write thresholds into thresh.h f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") tdt_hls = tdt.get_hls_datatype_str() - odt = self.get_output_datatype() - odt_hls = odt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ = ".format( @@ -387,6 +625,7 @@ def generate_params(self, model, path): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -415,6 +654,12 @@ def execute_node(self, context, graph): not float32 as expected.""" expected_inp_shape = self.get_folded_input_shape() reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() # make copy before saving the array reshaped_input = reshaped_input.copy() np.save( @@ -430,25 +675,46 @@ def execute_node(self, context, graph): super().exec_precompiled_singlenode_model() # load output npy file super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out assert ( context[node.output[0]].shape == self.get_normal_output_shape() ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - idt = self.get_input_datatype() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), 
export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -466,6 +732,12 @@ def execute_node(self, context, graph): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) if self.calc_tmem() != 0: self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -474,19 +746,28 @@ def defines(self, var): numReps = 1 * dim_h * dim_w k_h, k_w = self.get_nodeattr("Kernel") innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( self.get_nodeattr("Channels"), innerProdDim, + self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), numReps, ) ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] elem_bits = dtype.bitwidth() packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -496,20 +777,61 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = 
wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") map_to_hls_mult_style = { "auto": "ap_resource_dflt()", "lut": "ap_resource_lut()", @@ -521,20 +843,54 @@ def docompute(self): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] elem_bits = dtype.bitwidth() packed_bits = self.get_outstream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -546,12 +902,13 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) @@ -561,84 +918,251 @@ def save_as_npy(self): self.code_gen_dict["$SAVEASCNPY$"] = [] def 
blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" ) - ] def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) - in_fifo_depth = self.get_nodeattr("inFIFODepth") - out_fifo_depth = self.get_nodeattr("outFIFODepth") - # insert depth pragmas only if specified - if in_fifo_depth != 0: + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") ) - if out_fifo_depth != 0: + elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=out" % out_fifo_depth + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") ) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = 
self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "amd.com:finn:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " + "CONFIG.RAM_STYLE {%s} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", + self.get_nodeattr("ram_style"), + node_name, + strm_inst, ) ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = 
self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier def bram_estimation(self): """Calculates resource estimation for BRAM""" # TODO add in/out FIFO contributions P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() omega = self.calc_wmem() + mem_width = Q * W * P # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs - if self.calc_wmem() <= 128: + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): return 0 - if W == 1: - return math.ceil(omega / 16384) * P - elif W == 2: - return math.ceil(omega / 8192) * P - elif W <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P - elif W <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P - elif W <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) else: - return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) def bram_efficiency_estimation(self): P = self.get_nodeattr("PE") @@ -662,6 +1186,7 @@ def lut_estimation(self): """ # TODO add in/out FIFO contributions P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() # determine tdt with input and weight data types @@ -671,30 +1196,49 @@ def lut_estimation(self): c0 = 300 c1 = 1.1 c2 = 0 - if self.calc_wmem() <= 128: - c2 = P * W * math.ceil(self.calc_wmem() / 64) + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = 
(P * Q * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") if res_type == "dsp": mult_luts = 0 else: - mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A) + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() k_h, k_w = self.get_nodeattr("Kernel") - acc_bits = W + A + math.ceil(math.log(k_h * k_w, 2)) + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) acc_luts = acc_bits # thresholds and threshold comparators thr_luts = 0 comp_luts = 0 noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute if noact == 0: odt = self.get_output_datatype() B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 comp_luts = (2**B - 1) * acc_bits - return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2) + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) def dsp_estimation(self): # multiplication @@ -710,6 +1254,26 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. 
Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") @@ -733,3 +1297,18 @@ def get_op_and_param_counts(self): thres_count = fm ret_dict[thres_param_type] = thres_count return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json index 27ec38f6a4..a053c1a22f 100644 --- a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json +++ b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json @@ -7,6 +7,7 @@ "standalone_thresholds": true, "shell_flow_type": "vivado_zynq", "verify_save_rtlsim_waveforms": true, + "force_python_rtlsim": true, "verify_steps": [ "initial_python", "streamlined_python", diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy index a8d0938463..98037351bb 100644 Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy index edd24de05a..8bece67b7d 100644 Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ diff --git a/src/finn/qnn-data/cpp/verilator_fifosim.cpp b/src/finn/qnn-data/cpp/verilator_fifosim.cpp new file mode 100644 index 0000000000..d0aca9efe7 --- /dev/null +++ b/src/finn/qnn-data/cpp/verilator_fifosim.cpp @@ -0,0 +1,197 @@ +/* Copyright (C) 2022, Advanced Micro Devices, Inc. +All rights reserved. +# +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +# +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +# +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +# +* Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +# +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +#include <iostream> +#include <fstream> +#include <cstddef> +#include <chrono> +#include "verilated.h" +#include "verilated_vcd_c.h" +#include "Vfinn_design_wrapper.h" + +#ifdef DEBUG +#define TRACE(x) x +#else +#define TRACE(x) ; +#endif + +using namespace std; + +Vfinn_design_wrapper * top; + +// code taken from pyverilator_wrapper.cpp generated by PyVerilator + +// this is required by verilator for verilog designs using $time +// main_time is incremented in eval +double main_time = 0; + +double sc_time_stamp() { +return main_time; +} +// function definitions +// helper functions for basic verilator tasks +extern "C" { //Open an extern C closed below +Vfinn_design_wrapper* construct() { + Verilated::commandArgs(0, (const char**) nullptr); + TRACE(Verilated::traceEverOn(true)); + Vfinn_design_wrapper* top = new Vfinn_design_wrapper(); + return top; +} +int eval(Vfinn_design_wrapper* top) { + top->eval(); + main_time++; + return 0; +} +int destruct(Vfinn_design_wrapper* top) { + if (top != nullptr) { + delete top; + top = nullptr; + } + return 0; +} + +TRACE( +VerilatedVcdC* tfp; +VerilatedVcdC* start_vcd_trace(Vfinn_design_wrapper* top, const char* filename) { + VerilatedVcdC* tfp = new VerilatedVcdC; + top->trace(tfp, 99); + tfp->open(filename); + return tfp; +} +int add_to_vcd_trace(VerilatedVcdC* tfp, int time) { + tfp->dump(time); + return 0; +} +int flush_vcd_trace(VerilatedVcdC* tfp) { + tfp->flush(); + return 0; +} +int stop_vcd_trace(VerilatedVcdC* tfp) { + tfp->close(); + return 0; +} +) + +} + +// end of code taken from pyverilator_wrapper.cpp generated by PyVerilator + +inline void toggle_clk() { + eval(top); + top->ap_clk = 1; + TRACE(add_to_vcd_trace(tfp, main_time)); + eval(top); + top->ap_clk = 0; + TRACE(add_to_vcd_trace(tfp, main_time)); +} + + +void reset() { + top->ap_rst_n = 0; + for(unsigned i = 0; i < 10; i++) { + toggle_clk(); + } + top->ap_rst_n = 1; +} + +int main(int argc, char *argv[]) { + top = construct(); + TRACE(tfp = start_vcd_trace(top, "trace.vcd")); + unsigned n_iters_per_input = @ITERS_PER_INPUT@; + unsigned n_iters_per_output = @ITERS_PER_OUTPUT@; + unsigned n_inputs = @N_INPUTS@; + unsigned max_iters = @MAX_ITERS@; + + reset(); + + top->m_axis_0_tready = 1; + top->s_axis_0_tvalid = 1; + + unsigned n_in_txns = 0, n_out_txns = 0, iters = 0, last_output_at = 0; + unsigned latency = 0; + + bool exit_criterion = false; + + cout << "Simulation starting" << endl; + cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl; + cout << "Number of outputs to expect " << n_iters_per_output * n_inputs << endl; + cout << "No-output timeout clock cycles " << max_iters << endl; + + chrono::steady_clock::time_point begin = chrono::steady_clock::now(); + + while(!exit_criterion) { + toggle_clk(); + iters++; + if(iters % 1000 == 0) { + cout << "Elapsed iters " << iters << " inps " << n_in_txns << " outs " << n_out_txns << endl; + chrono::steady_clock::time_point end = chrono::steady_clock::now(); + cout << "Elapsed since last report = " << chrono::duration_cast<chrono::seconds>(end -
begin).count() << "[s]" << endl; + begin = end; + } + if(top->s_axis_0_tready == 1 && top->s_axis_0_tvalid == 1) { + n_in_txns++; + if(n_in_txns == n_iters_per_input * n_inputs) { + top->s_axis_0_tvalid = 0; + cout << "All inputs written at cycle " << iters << endl; + } + } + if(top->m_axis_0_tvalid == 1) { + n_out_txns++; + last_output_at = iters; + if(n_out_txns == n_iters_per_output) { + latency = iters; + } + } + + exit_criterion = ((n_in_txns >= n_iters_per_input * n_inputs) && (n_out_txns >= n_iters_per_output * n_inputs)) || ((iters-last_output_at) > max_iters); + } + + TRACE(flush_vcd_trace(tfp)); + TRACE(stop_vcd_trace(tfp)); + + cout << "Simulation finished" << endl; + cout << "Number of inputs consumed " << n_in_txns << endl; + cout << "Number of outputs produced " << n_out_txns << endl; + cout << "Number of clock cycles " << iters << endl; + + ofstream results_file; + results_file.open("results.txt", ios::out | ios::trunc); + results_file << "N_IN_TXNS" << "\t" << n_in_txns << endl; + results_file << "N_OUT_TXNS" << "\t" << n_out_txns << endl; + results_file << "cycles" << "\t" << iters << endl; + results_file << "N" << "\t" << n_inputs << endl; + results_file << "latency_cycles" << "\t" << latency << endl; +@FIFO_DEPTH_LOGGING@ + results_file.close(); + + + + destruct(top); + + return 0; +} diff --git a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py index be09abad9c..e0e2a75f19 100644 --- a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py +++ b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py @@ -57,9 +57,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches): help='name of bitfile (i.e. "resizer.bit")', default="../bitfile/finn-accel.bit", ) - parser.add_argument( - "--dataset_root", help="dataset root dir for download/reuse", default="." 
- ) + parser.add_argument("--dataset_root", help="dataset root dir for download/reuse", default=".") parser.add_argument( "--limit_batches", help="number of batches, -1 for max", type=int, default=-1 ) @@ -72,9 +70,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches): limit_batches = args.limit_batches print("Loading dataset...") - (test_imgs, test_labels) = make_unsw_nb15_test_batches( - bsize, dataset_root, limit_batches - ) + (test_imgs, test_labels) = make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches) ok = 0 nok = 0 diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 2096760580..f701122885 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -122,7 +122,7 @@ def load_external_weights(self): w_filenames = [] if not os.path.isdir(self.runtime_weight_dir): return - for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + for dirpath, dirnames, filenames in os.walk(self.runtime_weight_dir): w_filenames.extend(filenames) tmp_weight_dict = {} @@ -173,7 +173,7 @@ def load_runtime_weights(self, flush_accel=True, verify=True): w_filenames = [] if not os.path.isdir(self.runtime_weight_dir): return - for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + for dirpath, dirnames, filenames in os.walk(self.runtime_weight_dir): w_filenames.extend(filenames) rt_weight_dict = {} for w_filename in w_filenames: @@ -182,22 +182,14 @@ def load_runtime_weights(self, flush_accel=True, verify=True): dat = f.read() else: continue - layer_w = np.fromiter( - [int(x, 16) for x in dat.strip().split()], dtype=np.uint32 - ) + layer_w = np.fromiter([int(x, 16) for x in dat.strip().split()], dtype=np.uint32) sdp_ind = int(w_filename.split("_")[0]) layer_ind = int(w_filename.split("_")[1]) rt_weight_dict[(sdp_ind, layer_ind)] = layer_w for sdp_ind, layer_ind in rt_weight_dict.keys(): - cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % ( - sdp_ind, - layer_ind, - ) + cand_if_name = "StreamingDataflowPartition_%d" % sdp_ind if cand_if_name in self.ip_dict.keys(): - layer_mmio = getattr( - getattr(self, "StreamingDataflowPartition_%d" % sdp_ind), - "s_axilite_%d" % layer_ind, - ).mmio + layer_mmio = getattr(self, "StreamingDataflowPartition_%d" % sdp_ind).mmio layer_w = rt_weight_dict[(sdp_ind, layer_ind)] layer_mmio.write_mm(0, layer_w.tobytes()) if verify: @@ -346,9 +338,7 @@ def execute_on_buffers(self, asynch=False, batch_size=None): assert batch_size <= self.batch_size, "Specified batch_size is too large." 
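The magic constants in the DMA status checks around here follow the standard AXI-lite ap_ctrl register layout of HLS-generated kernels; a sketch of the assumed bit meanings (the names are descriptive, not from this patch):

AP_CTRL_OFF = 0x00   # control/status register, polled via read(0x00)
AP_START = 1 << 0    # written by idma[i].write(0x00, 1) to launch a transfer
AP_DONE = 1 << 1     # wait_until_finished() polls status & 0x2
AP_IDLE = 1 << 2     # checked before launch: read(0x00) & 0x4 != 0

def dma_is_idle(dma):
    # mirrors the idle assertion on the output DMAs below
    return bool(dma.read(AP_CTRL_OFF) & AP_IDLE)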
if self.platform == "zynq-iodma": for o in range(self.num_outputs): - assert ( - self.odma[o].read(0x00) & 0x4 != 0 - ), "Output DMA %d is not idle" % (o) + assert self.odma[o].read(0x00) & 0x4 != 0, "Output DMA %d is not idle" % (o) # manually launch IODMAs since signatures are missing for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.write(0x10, iwbuf.device_address) @@ -364,17 +354,13 @@ def execute_on_buffers(self, asynch=False, batch_size=None): self.idma[i].write(0x00, 1) elif self.platform == "alveo": for o in range(self.num_outputs): - assert self.odma_handle[o] is None, ( - "Output DMA %d is already running" % o - ) + assert self.odma_handle[o] is None, "Output DMA %d is already running" % o for i in range(self.num_inputs): self.idma[i].start(self.ibuf_packed_device[i], batch_size) for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.start(iwbuf, batch_size) for o in range(self.num_outputs): - self.odma_handle[o] = self.odma[o].start( - self.obuf_packed_device[o], batch_size - ) + self.odma_handle[o] = self.odma[o].start(self.obuf_packed_device[o], batch_size) else: raise Exception("Unrecognized platform: %s" % self.platform) # blocking behavior depends on asynch parameter @@ -390,9 +376,7 @@ def wait_until_finished(self): while status & 0x2 == 0: status = self.odma[o].read(0x00) elif self.platform == "alveo": - assert all( - [x is not None for x in self.odma_handle] - ), "No odma_handle to wait on" + assert all([x is not None for x in self.odma_handle]), "No odma_handle to wait on" for o in range(self.num_outputs): self.odma_handle[o].wait() self.odma_handle[o] = None @@ -406,9 +390,7 @@ def execute(self, input_npy): # if single input, convert to list to normalize how we process the input if not type(input_npy) is list: input_npy = [input_npy] - assert self.num_inputs == len( - input_npy - ), "Not all accelerator inputs are specified." + assert self.num_inputs == len(input_npy), "Not all accelerator inputs are specified." for i in range(self.num_inputs): ibuf_folded = self.fold_input(input_npy[i], ind=i) ibuf_packed = self.pack_input(ibuf_folded, ind=i) diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 1b29d4342c..c8bc1c009d 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -38,9 +38,7 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument( - "--dataset", help="dataset to use (mnist of cifar10)", required=True - ) + parser.add_argument("--dataset", help="dataset to use (mnist of cifar10)", required=True) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx new file mode 100644 index 0000000000..c96e8c694e Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh new file mode 100644 index 0000000000..1c8b6403e8 --- /dev/null +++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh @@ -0,0 +1,346 @@ +// (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved. +// +// This file contains confidential and proprietary information +// of Xilinx, Inc. and is protected under U.S. and +// international copyright and other intellectual property +// laws. 
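Stepping back to the PYNQ driver template edited above, its batch-execution path can be summarized in a short sketch; the method names are the ones visible in driver_base.py, while the device-buffer copy step is elided in this hunk:

def run_batch(accel, input_npy):
    # normalize a single array to a list, as execute() does
    if not type(input_npy) is list:
        input_npy = [input_npy]
    assert accel.num_inputs == len(input_npy)
    for i in range(accel.num_inputs):
        ibuf_folded = accel.fold_input(input_npy[i], ind=i)  # reshape to folded form
        ibuf_packed = accel.pack_input(ibuf_folded, ind=i)   # bit-pack to the HW datatype
        # ... ibuf_packed would be copied into accel's device buffers here (not shown) ...
    accel.execute_on_buffers(asynch=True)  # launch input/output (and external-weight) DMAs
    accel.wait_until_finished()            # poll the output DMA ap_done bits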
+// +// DISCLAIMER +// This disclaimer is not a license and does not grant any +// rights to the materials distributed herewith. Except as +// otherwise provided in a valid license issued to you by +// Xilinx, and to the maximum extent permitted by applicable +// law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +// WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +// AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +// BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +// INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +// (2) Xilinx shall not be liable (whether in contract or tort, +// including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature +// related to, arising under or in connection with these +// materials, including for any direct, or any indirect, +// special, incidental, or consequential loss or damage +// (including loss of data, profits, goodwill, or any type of +// loss or damage suffered as a result of any action brought +// by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the +// possibility of the same. +// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail- +// safe, or for use in any application requiring fail-safe +// performance, such as life-support or safety devices or +// systems, Class III medical devices, nuclear facilities, +// applications related to the deployment of airbags, or any +// other applications that could lead to death, personal +// injury, or severe property or environmental damage +// (individually and collectively, "Critical +// Applications"). Customer assumes the sole risk and +// liability of any use of Xilinx products in Critical +// Applications, subject only to applicable laws and +// regulations governing limitations on product liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +// PART OF THIS FILE AT ALL TIMES. 
+//----------------------------------------------------------------------------- +// +// Generic Functions used by AXIS-Interconnect and Infrastrucutre Modules +// +// Verilog-standard: Verilog 2001 +//-------------------------------------------------------------------------- +// Global Parameters: +// +// Functions: +// f_clogb2 +// f_gcd +// f_lcm +// f_get_tdata_indx +// f_get_tstrb_indx +// f_get_tkeep_indx +// f_get_tlast_indx +// f_get_tid_indx +// f_get_tdest_indx +// f_get_tuser_indx +// f_payload_width +// Tasks: +// t_display_tdata_error +//-------------------------------------------------------------------------- +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Global Parameters +/////////////////////////////////////////////////////////////////////////////// +// Define Signal Set indices +localparam G_INDX_SS_TREADY = 0; +localparam G_INDX_SS_TDATA = 1; +localparam G_INDX_SS_TSTRB = 2; +localparam G_INDX_SS_TKEEP = 3; +localparam G_INDX_SS_TLAST = 4; +localparam G_INDX_SS_TID = 5; +localparam G_INDX_SS_TDEST = 6; +localparam G_INDX_SS_TUSER = 7; +localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY; +localparam G_MASK_SS_TDATA = 32'h1 << G_INDX_SS_TDATA; +localparam G_MASK_SS_TSTRB = 32'h1 << G_INDX_SS_TSTRB; +localparam G_MASK_SS_TKEEP = 32'h1 << G_INDX_SS_TKEEP; +localparam G_MASK_SS_TLAST = 32'h1 << G_INDX_SS_TLAST; +localparam G_MASK_SS_TID = 32'h1 << G_INDX_SS_TID ; +localparam G_MASK_SS_TDEST = 32'h1 << G_INDX_SS_TDEST; +localparam G_MASK_SS_TUSER = 32'h1 << G_INDX_SS_TUSER; + +// Task DRC error levels +localparam G_TASK_SEVERITY_ERR = 2; +localparam G_TASK_SEVERITY_WARNING = 1; +localparam G_TASK_SEVERITY_INFO = 0; + +/////////////////////////////////////////////////////////////////////////////// +// BEGIN Functions +/////////////////////////////////////////////////////////////////////////////// +// ceiling logb2 + function integer f_clogb2 (input integer size); + integer s; + begin + s = size; + s = s - 1; + for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1) + s = s >> 1; + end + endfunction // clogb2 + + // Calculates the Greatest Common Divisor between two integers using the + // euclidean algorithm. + function automatic integer f_gcd ( + input integer a, + input integer b + ); + begin : main + integer A, B, done, swap; + A = a; + B = b; + done = 0; + while(!done) + begin + if (A < B ) begin + swap = A; + A = B; + B = swap; + end else if ( B != 0 ) begin + A = A - B; + end else begin + done = 1; + end + end + + f_gcd = A; + end + endfunction + + + // Calculates the Lowest Common Denominator between two integers + function integer f_lcm ( + input integer a, + input integer b + ); + begin : main + f_lcm = ( a / f_gcd(a, b)) * b; + end + endfunction + + // Returns back the index to the TDATA portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdata_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + f_get_tdata_indx = 0; + end + endfunction + + // Returns back the index to the tstrb portion of TPAYLOAD, returns 0 if the + // signal is not enabled. 
+ function integer f_get_tstrb_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST); + // If TDATA exists, then add its width to its base to get the tstrb index + f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx; + end + endfunction + + // Returns back the index to the tkeep portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tkeep_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST); + f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tlast portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tlast_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST); + f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx; + end + endfunction + + // Returns back the index to the tid portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tid_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST); + f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx; + end + endfunction + + // Returns back the index to the tdest portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tdest_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST); + f_get_tdest_indx = SST[G_INDX_SS_TID] ? cur_indx + IDW : cur_indx; + end + endfunction + + // Returns back the index to the tuser portion of TPAYLOAD, returns 0 if the + // signal is not enabled. + function integer f_get_tuser_indx ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST); + f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx; + end + endfunction + + // Payload is the sum of all the AXIS signals present except for + // TREADY/TVALID + function integer f_payload_width ( + input integer DAW, // TDATA Width + input integer IDW, // TID Width + input integer DEW, // TDEST Width + input integer USW, // TUSER Width + input [31:0] SST // Signal Set + ); + begin : main + integer cur_indx; + cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST); + f_payload_width = SST[G_INDX_SS_TUSER] ? 
cur_indx + USW : cur_indx; + // Ensure that the return value is never less than 1 + f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width; + end + endfunction + + task t_check_tdata_width( + input integer data_width, + input [8*80-1:0] var_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tdata_width + if (data_width%8 != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s (%2d) must be a multiple of 8.", var_name, data_width); + $display(" AXI4-Stream data width is only defined for byte multiples. See the "); + $display(" AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more"); + $display(" information."); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask + + task t_check_tuser_width( + input integer tuser_width, + input [8*80-1:0] tuser_name, + input integer tdata_width, + input [8*80-1:0] tdata_name, + input [8*80-1:0] inst_name, + input integer severity_lvl, + output integer ret_val + ); + // Severity levels: + // 0 = INFO + // 1 = WARNING + // 2 = ERROR + begin : t_check_tuser_width + integer tdata_bytes; + tdata_bytes = tdata_width/8; + if ((tuser_width%tdata_bytes) != 0) begin + // 000 1 2 3 4 5 6 7 8 + // 012 0 0 0 0 0 0 0 0 + if (severity_lvl >= 2) begin + $display("ERROR: %m::%s", inst_name); + end else if (severity_lvl == 1) begin + $display("WARNING: %m::%s", inst_name); + end else begin + $display("INFO: %m::%s", inst_name); + end + $display(" Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width); + $display(" multiple of the width of the interface (%s == %2d) in bytes.' AXI4-Stream", tdata_name, tdata_width); + $display(" TUSER width in this module is only defined when the TUSER is the"); + $display(" recommended value. See the AMBA4 AXI4-Stream Protocol Specification v1.0"); + $display(" Section 2.1, 2.3.3 and 2.8 for more information. "); + ret_val = 1; + end else begin + ret_val = 0; + end + end + endtask diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 0cc4234c8c..bb5637f7d3 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -76,9 +76,7 @@ def apply(self, model): # recurse into model to manually annotate per-layer resources sdp_model_filename = getCustomOp(node).get_nodeattr("model") sdp_model = ModelWrapper(sdp_model_filename) - sdp_model = sdp_model.transform( - AnnotateResources(self.mode, self.res_dict) - ) + sdp_model = sdp_model.transform(AnnotateResources(self.mode, self.res_dict)) sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode) sdp_dict = eval(sdp_dict) # save transformed model diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py index 1d0efaf4bb..398580c48e 100644 --- a/src/finn/transformation/fpgadataflow/cleanup.py +++ b/src/finn/transformation/fpgadataflow/cleanup.py @@ -79,7 +79,5 @@ def apply(self, model): except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." 
% op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index da337caa62..e93a8ec307 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -70,7 +70,5 @@ def applyNodeLocal(self, node): in node attribute "executable_path".""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index f0bd5fbd06..ef02453498 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -40,14 +40,14 @@ from qonnx.util.basic import get_by_name from qonnx.util.onnx import nchw_to_nhwc -from finn.transformation.fpgadataflow.minimize_accumulator_width import ( - MinimizeAccumulatorWidth, -) - class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" + def __init__(self, use_rtl_variant=False): + super().__init__() + self.use_rtl_variant = use_rtl_variant + def apply(self, model): graph = model.graph node_ind = 0 @@ -61,9 +61,7 @@ def apply(self, model): i2c_out_shape = model.get_tensor_shape(i2c_output) dt = model.get_tensor_datatype(i2c_input) if not dt.is_integer(): - warnings.warn( - "%s : Input is not int. Can't infer ConvInpGen." % n.name - ) + warnings.warn("%s : Input is not int. Can't infer ConvInpGen." % n.name) continue i2c_inst = getCustomOp(n) stride_h, stride_w = i2c_inst.get_nodeattr("stride") @@ -92,8 +90,7 @@ def apply(self, model): # assert dt.allowed(pad_val),"""FMPadding_Batch DataType # must support pad_val""" assert pad_val == 0, ( - "%s : FMPadding_Batch doesn't currently support pad_val!= 0" - % n.name + "%s : FMPadding_Batch doesn't currently support pad_val!= 0" % n.name ) odim_padding_h = ifm_dim_h + pad_h @@ -113,8 +110,10 @@ def apply(self, model): ConvInpGen_idim_h = odim_padding_h ConvInpGen_idim_w = odim_padding_w + padding_optype = "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch" + padding_node = helper.make_node( - "FMPadding_Batch", + padding_optype, [i2c_input], [padding_out], domain="finn.custom_op.fpgadataflow", @@ -128,105 +127,126 @@ def apply(self, model): ) graph.node.insert(node_ind, padding_node) - # Ensure that only supported HLS nodes are inserted + is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 is_equal_stride = stride_h == stride_w is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( k_h > 1 and k_w == 1 and ifm_dim_w == 1 ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - assert is_square_image, ( - "%s : DownSampler currently only supports square input images." 
- % n.name - ) - assert is_equal_stride, ( - """%s : DownSampler currently only supports equal stride value - along different axes.""" - % n.name - ) - ConvInpGen_idim = ConvInpGen_idim_h - stride = stride_h - # create DownSampler node + if self.use_rtl_variant: ConvInpGen_node = helper.make_node( - "DownSampler", + "ConvolutionInputGenerator_rtl", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - Stride=stride, + M=1, + parallel_window=0, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], inputDataType=dt.name, - name="DownSampler_" + n.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: - # create equivalent ConvolutionInputGenerator node - if ( - is_square_image and is_square_kernel - ): # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name - ) + # Ensure that only supported HLS nodes are inserted + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + is1D_unitx = ifm_dim_w == 1 + downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride + if not (downsample_1D or downsample_2D): + warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", + "DownSampler", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], + Stride=stride, inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, - ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - "%s: ConvolutionInputGenerator1D works only for 1D convs" - % n.name + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + # create equivalent ConvolutionInputGenerator node + if is_square_image and is_square_kernel: # square images and square kernels + assert is_equal_stride, ( + """%s: Non-equal strides along different axes is not supported + for (non-)square convolutions""" % n.name ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, 
stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + assert dilation_h == 1 and dilation_w == 1, ( + """%s: Dilation value != 1 is not supported + for square convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator_" + n.name, + ) + else: # 1D images and/or kernels + assert is_1d_convolution, ( + """%s: ConvolutionInputGenerator1D works only + for 1D convs""" + % n.name + ) + if dilation_h > 1 or dilation_w > 1: + assert depthwise == 1, ( + """%s: Dilation value > 1 is only supported for + 1D depthwise separable convolutions""" + % n.name + ) + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator1D", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + name="ConvolutionInputGenerator1D_" + n.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -259,15 +279,13 @@ def apply(self, model): dt = model.get_tensor_datatype(n.input[0]) if not dt.is_integer(): warnings.warn( - "%s: Input not int. Can't infer UpsampleNearestNeighbour." - % n.name + "%s: Input not int. Can't infer UpsampleNearestNeighbour." % n.name ) continue if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC: warnings.warn( - "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." - % n.name + "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." % n.name ) continue @@ -285,21 +303,24 @@ def apply(self, model): ) # Assumes nhwc layout for scales and input - assert scales[1] == scales[2], ( - "%s: Upsampling is only supported for quadratic scales." % n.name + is_scale_square_2d = scales[1] == scales[2] + is_scale_1d = scales[1] > 1 and scales[2] == 1 + assert is_scale_square_2d or is_scale_1d, ( + "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name ) assert scales[0] == scales[3] == 1, ( n.name + ": Upsampling is only supported for scales with " - "the first and last dimensions being 1." + "the first and last dimensions being 1 in NHWC." ) spatial_scale = scales[1] assert spatial_scale == int(spatial_scale), ( "%s: Upsampling is only supported for integer scales." % n.name ) + is_shape_square_2d = in_shape[1] == in_shape[2] + is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 - assert in_shape[1] == in_shape[2], ( - "%s: Upsampling is only supported for quadratic input shapes." - % n.name + assert is_shape_square_2d or is_shape_1d, ( + "%s: Upsampling is only supported for 1D H or 2D square inputs." 
% n.name ) # Extract information for HLS node @@ -308,6 +329,7 @@ def apply(self, model): NumChannels = in_shape[-1] numInputVectors = in_shape[0] inputDataType = dt.name + dim_mode = 0 if is_shape_square_2d else 1 # Insert the HLSCustomOp node Upsample_HLS_node = helper.make_node( @@ -321,6 +343,7 @@ def apply(self, model): NumChannels=NumChannels, inputDataType=inputDataType, numInputVectors=numInputVectors, + DimMode=dim_mode, name="UpsampleNearestNeighbour_Batch_" + n.name, ) @@ -500,9 +523,7 @@ def apply(self, model): elif node.op_type == "QuantAvgPool2d": assert odt.is_integer(), """Output data type for QuantAvgPool2d needs to be integer""" - assert all( - x == 0 for x in pad - ), "Padding is not supported for QuantAvgPool2d" + assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" inst = getCustomOp(node) pool_fxn = "QuantAvgPool" pool_size_param = inst.get_shifts() @@ -510,9 +531,7 @@ def apply(self, model): else: raise Exception( - "pad_value and pool_fxn not configured for {}".format( - node.op_type - ) + "pad_value and pool_fxn not configured for {}".format(node.op_type) ) # format input tensor @@ -707,7 +726,6 @@ def apply(self, model): graph.node.remove(n) graph_modified = True if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -772,17 +790,13 @@ def apply(self, model): scale = getCustomOp(consumer).get_nodeattr("out_scale") actval = getCustomOp(consumer).get_nodeattr("out_bias") assert int(actval) == actval, ( - consumer.name - + ": out_bias must be integer for HLS conversion." + consumer.name + ": out_bias must be integer for HLS conversion." ) actval = int(actval) odt_is_bipolar = odt == DataType["BIPOLAR"] - bipolar_ok = ( - odt_is_bipolar and (scale == 2.0) and (actval == -1) - ) + bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) assert scale == 1.0 or bipolar_ok, ( - consumer.name - + ": out_scale=1 or bipolar output needed for conversion." + consumer.name + ": out_scale=1 or bipolar output needed for conversion." ) assert (not odt.signed()) or (actval < 0), ( consumer.name + ": Signed output requres actval < 0" @@ -850,7 +864,6 @@ def apply(self, model): graph.node.remove(n) graph_modified = True if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -863,16 +876,17 @@ class InferVectorVectorActivation(Transformation): a depthwise convolution. Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "MatMul" - and model.get_tensor_sparsity(n.input[1]) is not None - ): + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: sparsity = model.get_tensor_sparsity(n.input[1]) try: k_h, k_w = sparsity["dw"]["kernel_shape"] @@ -931,13 +945,11 @@ def apply(self, model): odt = model.get_tensor_datatype(mt_output) scale = getCustomOp(consumer).get_nodeattr("out_scale") assert scale == 1.0, ( - consumer.name - + ": out_scale must be equal to 1.0 for HLS conversion." + consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." 
) actval = getCustomOp(consumer).get_nodeattr("out_bias") assert int(actval) == actval, ( - consumer.name - + ": out_bias must be integer for HLS conversion." + consumer.name + ": out_bias must be integer for HLS conversion." ) actval = int(actval) assert (not odt.signed()) or (actval < 0), ( @@ -963,6 +975,7 @@ def apply(self, model): ActVal=actval, noActivation=0, name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -998,7 +1011,6 @@ def apply(self, model): graph.node.remove(n) graph_modified = True if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -1053,13 +1065,11 @@ def apply(self, model): odt = model.get_tensor_datatype(thl_output) scale = getCustomOp(node).get_nodeattr("out_scale") assert scale == 1.0, ( - node.name - + ": MultiThreshold out_scale must be 1 for HLS conversion." + node.name + ": MultiThreshold out_scale must be 1 for HLS conversion." ) actval = getCustomOp(node).get_nodeattr("out_bias") assert int(actval) == actval, ( - node.name - + ": MultiThreshold out_bias must be integer for HLS conversion." + node.name + ": MultiThreshold out_bias must be integer for HLS conversion." ) actval = int(actval) assert (not odt.signed()) or (actval < 0), ( @@ -1076,7 +1086,8 @@ def apply(self, model): PE=pe, numSteps=thl_thres_shape[1], inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + # weightDataType can be tightened by MinimizeAccumulatorWidth + weightDataType=idt.name, outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, @@ -1089,7 +1100,6 @@ def apply(self, model): graph_modified = True if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -1110,10 +1120,16 @@ def apply(self, model): result = node.output[0] in0_shape = model.get_tensor_shape(in0) in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) # skip if different shapes on inputs if in0_shape != in1_shape: continue + # skip if any of inputs have initializers + # (this node is meant for adding two dynamic streams) + if in0_static or in1_static: + continue idt0 = model.get_tensor_datatype(in0) idt1 = model.get_tensor_datatype(in1) @@ -1227,6 +1243,7 @@ def apply(self, model): inputDataType=dt.name, numInputVectors=vecs, NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, name="DuplicateStreams_Batch_" + node.name, ) @@ -1322,9 +1339,7 @@ def apply(self, model): # check if the shape of initializer is compatible ll_cinit_shape = list(ll_cinit.shape) if np.prod(ll_cinit_shape) == 1: - warnings.warn( - "Broadcasting " + str(node.op_type) + "(" + node.name + ")" - ) + warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") ll_cinit = np.full((ch), ll_cinit.flatten()[0]) elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: # parameter shape not compatible with Channelwise_batch @@ -1633,11 +1648,13 @@ def apply(self, model): dt0 = model.get_tensor_datatype(node.input[0]) if dt0 is None: continue - dt_coherent = all( - [model.get_tensor_datatype(x) == dt0 for x in node.input] - ) + dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) if not dt_coherent: 
continue + # skip conversion if any inputs are static + all_static = all([model.get_initializer(x) is None for x in node.input]) + if not all_static: + continue # skip conversion if inputs are not integers if not dt0.is_integer(): continue @@ -1654,6 +1671,7 @@ def apply(self, model): ElemsPerStream=elems_per_stream, inputDataType=dt0.name, numInputVectors=inp_vec, + inFIFODepths=[2] * len(node.input), ) graph.node.insert(node_ind, new_node) # remove old node @@ -1664,3 +1682,101 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Sub": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for two dynamic streams) + if in0_static or in1_static: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip conversion for layers with float input + if not (idt0.is_integer() and idt1.is_integer()): + continue + + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new Eltwise node + new_node = helper.make_node( + "StreamingEltwise", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType0=idt0.name, + inputDataType1=idt1.name, + eltwiseOp=eltwiseOp, + numInputVectors=in0_shape[:-1], + name="StreamingEltwise_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old nodes + for nd in nodes_to_remove: + graph.node.remove(nd) + graph_modified = True + + # if graph_modified: + # model = model.transform(InferShapes()) + # model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py 
b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 892ab09fdf..9a653fe404 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -26,8 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import json import multiprocessing as mp import os @@ -86,9 +84,7 @@ class CreateStitchedIP(Transformation): The packaged block design IP can be found under the ip subdirectory. """ - def __init__( - self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[] - ): + def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[]): super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns @@ -121,17 +117,13 @@ def connect_clk_rst(self, node): # make clock and reset external, if they aren't already if not self.clock_reset_are_external: self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" - % (inst_name, clock_intf_name) + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" - % (inst_name, reset_intf_name) - ) - self.connect_cmds.append( - "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]" + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) ) + self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]") self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] self.intf_names["rst"] = ["ap_rst_n"] @@ -172,13 +164,9 @@ def connect_axi(self, node): ) self.connect_cmds.append("assign_bd_address") seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name) - self.connect_cmds.append( - "set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name) - ) + self.connect_cmds.append("set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name)) # TODO should propagate this information from the node instead of 4G - self.connect_cmds.append( - "set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name) - ) + self.connect_cmds.append("set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name)) self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])] self.has_aximm = True @@ -215,8 +203,7 @@ def connect_s_axis_external(self, node, idx=None): continue input_intf_name = input_intf_names[i][0] self.connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" - % (inst_name, input_intf_name) + "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, input_intf_name) ) self.connect_cmds.append( "set_property name s_axis_%d [get_bd_intf_ports %s_0]" @@ -228,6 +215,20 @@ def connect_s_axis_external(self, node, idx=None): ) self.s_axis_idx += 1 + def connect_ap_none_external(self, node): + inst_name = node.name + node_inst = getCustomOp(node) + input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"] + # make external + for i in range(len(input_intf_names)): + input_intf_name = input_intf_names[i] + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, input_intf_name) + ) + self.connect_cmds.append( + "set_property name %s [get_bd_ports %s_0]" % (input_intf_name, input_intf_name) + ) + def insert_signature(self, checksum_count): signature_vlnv = "AMD:user:axi_info_top:1.0" signature_name = "axi_info_top0" 
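For orientation, a minimal usage sketch of the transformation modified in this file; the part number and clock period are illustrative:

from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP

model = model.transform(CreateStitchedIP(fpgapart="xczu3eg-sbva484-1-e", clk_ns=10.0))
print(model.get_metadata_prop("clk_ns"))  # "10.0", recorded by apply() below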
@@ -251,12 +252,10 @@ def insert_signature(self, checksum_count): ) # set clk and reset self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" - % signature_name + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" % signature_name ) self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" - % signature_name + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" % signature_name ) fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 @@ -274,9 +273,7 @@ def insert_signature(self, checksum_count): self.connect_cmds.append( "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name ) - self.connect_cmds.append( - "set_property name s_axis_info [get_bd_intf_ports s_axi_0]" - ) + self.connect_cmds.append("set_property name s_axilite_info [get_bd_intf_ports s_axi_0]") self.connect_cmds.append("assign_bd_address") def apply(self, model): @@ -294,17 +291,24 @@ def apply(self, model): behavior. It is strongly recommended to insert FIFOs prior to calling CreateStitchedIP.""" ) + if model.graph.node[0].op_type == "StreamingFIFO": + firstfifo = getCustomOp(model.graph.node[0]) + if firstfifo.get_nodeattr("impl_style") == "vivado": + warnings.warn( + """First FIFO has impl_style=vivado, which may cause + simulation glitches (e.g. dropping the first input sample + after reset).""" + ) for node in model.graph.node: # ensure that all nodes are fpgadataflow, and that IPs are generated - assert is_fpgadataflow_node( - node - ), "All nodes must be FINN fpgadataflow nodes." + assert is_fpgadataflow_node(node), "All nodes must be FINN fpgadataflow nodes." node_inst = getCustomOp(node) ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." 
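The clock bookkeeping in this file (in insert_signature above, and again below where FREQ_HZ is set) is plain unit conversion; a worked check with an illustrative 10 ns clock:

clk_ns = 10.0
fclk_mhz = 1 / (clk_ns * 0.001)   # 100.0 MHz
fclk_hz = fclk_mhz * 1000000      # 100000000.0 Hz
assert round(fclk_hz) == 100000000  # FREQ_HZ is emitted with %d, hence the round()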
ip_dirs += [ip_dir_value] self.create_cmds += node_inst.code_generation_ipi() self.connect_clk_rst(node) + self.connect_ap_none_external(node) self.connect_axi(node) for i in range(len(node.input)): if not is_external_input(model, node, i): @@ -312,12 +316,10 @@ def apply(self, model): if producer is None: continue j = list(producer.output).index(node.input[i]) - src_intf_name = getCustomOp( - producer - ).get_verilog_top_module_intf_names()["m_axis"][j][0] - dst_intf_name = node_inst.get_verilog_top_module_intf_names()[ - "s_axis" - ][i][0] + src_intf_name = getCustomOp(producer).get_verilog_top_module_intf_names()[ + "m_axis" + ][j][0] + dst_intf_name = node_inst.get_verilog_top_module_intf_names()["s_axis"][i][0] self.connect_cmds.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " "[get_bd_intf_pins %s/%s]" @@ -357,9 +359,10 @@ def apply(self, model): tcl = [] # create vivado project tcl.append( - "create_project %s %s -part %s" - % (prjname, vivado_stitch_proj_dir, self.fpgapart) + "create_project %s %s -part %s" % (prjname, vivado_stitch_proj_dir, self.fpgapart) ) + # no warnings on long module names + tcl.append("set_msg_config -id {[BD 41-1753]} -suppress") # add all the generated IP dirs to ip_repo_paths ip_dirs_str = " ".join(ip_dirs) tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str) @@ -372,8 +375,7 @@ def apply(self, model): fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) - tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz) - tcl.append("regenerate_bd_layout") + tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) @@ -387,11 +389,11 @@ def apply(self, model): wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) tcl.append("add_files -norecurse %s" % wrapper_filename) model.set_metadata_prop("wrapper_filename", wrapper_filename) + tcl.append("set_property top %s_wrapper [current_fileset]" % block_name) # synthesize to DCP and export stub, DCP and constraints if self.vitis: tcl.append( - "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" - % bd_filename + "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" % bd_filename ) tcl.append( "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} " @@ -424,6 +426,8 @@ def apply(self, model): ) % (vivado_stitch_proj_dir, block_vendor, block_library, block_name) ) + # Allow user to customize clock in deployment of stitched IP + tcl.append("set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]") # in some cases, the IP packager seems to infer an aperture of 64K or 4G, # preventing address assignment of the DDR_LOW and/or DDR_HIGH segments # the following is a hotfix to remove this aperture during IODMA packaging @@ -441,16 +445,9 @@ def apply(self, model): # if targeting Vitis, add some properties to the IP if self.vitis: # replace source code with dcp - tcl.append( - "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv - ) - tcl.append( - "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv - ) - tcl.append( - "set_property supported_families { } [ipx::find_open_core %s]" - % block_vlnv - ) + tcl.append("set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv) + tcl.append("set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv) + tcl.append("set_property 
supported_families { } [ipx::find_open_core %s]" % block_vlnv) tcl.append( "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} " "[ipx::find_open_core %s]" % block_vlnv @@ -465,32 +462,20 @@ def apply(self, model): "ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]" ) - tcl.append( - "ipx::remove_all_file " - "[ipx::get_file_groups xilinx_anylanguagesynthesis]" - ) + tcl.append("ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagesynthesis]") tcl.append( "ipx::remove_file_group " "xilinx_anylanguagebehavioralsimulation [ipx::current_core]" ) - tcl.append( - "ipx::remove_file_group " - "xilinx_anylanguagesynthesis [ipx::current_core]" - ) + tcl.append("ipx::remove_file_group " "xilinx_anylanguagesynthesis [ipx::current_core]") # remove sim and src folders tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir) tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir) # copy and add DCP, stub, and xdc tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir) tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir) - tcl.append( - "file copy -force %s.dcp %s/ip/dcp" - % (block_name, vivado_stitch_proj_dir) - ) - tcl.append( - "file copy -force %s.xdc %s/ip/impl" - % (block_name, vivado_stitch_proj_dir) - ) + tcl.append("file copy -force %s.dcp %s/ip/dcp" % (block_name, vivado_stitch_proj_dir)) + tcl.append("file copy -force %s.xdc %s/ip/impl" % (block_name, vivado_stitch_proj_dir)) tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]") tcl.append( "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]" @@ -501,37 +486,97 @@ def apply(self, model): "[ipx::get_files impl/%s.xdc " "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name ) - tcl.append( - "ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]" - ) + tcl.append("ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]") tcl.append( "ipx::add_file dcp/%s.dcp " "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name ) - tcl.append( - "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]" - ) + tcl.append("ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]") tcl.append( "ipx::add_file dcp/%s.dcp " "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name ) # add a rudimentary driver mdd to get correct ranges in xparameters.h later on - example_data_dir = pk.resource_filename("finn.qnn-data", "mdd-data/") + example_data_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/mdd-data" copytree(example_data_dir, vivado_stitch_proj_dir + "/data") - tcl.append("file copy -force data ip/") - tcl.append("ipx::add_file_group -type software_driver {} [ipx::current_core]") - tcl.append( - "set_property type mdd [ipx::add_file data/finn_design.mdd " - "[ipx::get_file_groups xilinx_softwaredriver -of_objects " - "[ipx::current_core]]]" - ) + + ##### + # Core Cleanup Operations tcl.append( - "set_property type tclSource [ipx::add_file data/finn_design.tcl " - "[ipx::get_file_groups xilinx_softwaredriver -of_objects " - "[ipx::current_core]]]" + """ +set core [ipx::current_core] + +# Add rudimentary driver +file copy -force data ip/ +set file_group [ipx::add_file_group -type software_driver {} $core] +set_property type mdd [ipx::add_file data/finn_design.mdd $file_group] +set_property type tclSource [ipx::add_file data/finn_design.tcl $file_group] + +# Remove all XCI references to subcores +set impl_files [ipx::get_file_groups 
xilinx_implementation -of $core] +foreach xci [ipx::get_files -of $impl_files {*.xci}] { + ipx::remove_file [get_property NAME $xci] $impl_files +} + +# Construct a single flat memory map for each AXI-lite interface port +foreach port [get_bd_intf_ports -filter {CONFIG.PROTOCOL==AXI4LITE}] { + set pin $port + set awidth "" + while { $awidth == "" } { + set pins [get_bd_intf_pins -of [get_bd_intf_nets -boundary_type lower -of $pin]] + set kill [lsearch $pins $pin] + if { $kill >= 0 } { set pins [lreplace $pins $kill $kill] } + if { [llength $pins] != 1 } { break } + set pin [lindex $pins 0] + set awidth [get_property CONFIG.ADDR_WIDTH $pin] + } + if { $awidth == "" } { + puts "CRITICAL WARNING: Unable to construct address map for $port." + } { + set range [expr 2**$awidth] + set range [expr $range < 4096 ? 4096 : $range] + puts "INFO: Building address map for $port: 0+:$range" + set name [get_property NAME $port] + set addr_block [ipx::add_address_block Reg0 [ipx::add_memory_map $name $core]] + set_property range $range $addr_block + set_property slave_memory_map_ref $name [ipx::get_bus_interfaces $name -of $core] + } +} + +# Finalize and Save +ipx::update_checksums $core +ipx::save_core $core + +# Remove stale subcore references from component.xml +file rename -force ip/component.xml ip/component.bak +set ifile [open ip/component.bak r] +set ofile [open ip/component.xml w] +set buf [list] +set kill 0 +while { [eof $ifile] != 1 } { + gets $ifile line + if { [string match {**} $line] == 1 } { + foreach l $buf { puts $ofile $l } + set buf [list $line] + } elseif { [llength $buf] > 0 } { + lappend buf $line + + if { [string match {**} $line] == 1 } { + if { $kill == 0 } { foreach l $buf { puts $ofile $l } } + set buf [list] + set kill 0 + } elseif { [string match {**} $line] == 1 } { + set kill 1 + } + } else { + puts $ofile $line + } +} +close $ifile +close $ofile +""" ) - tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv) - tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv) + # export list of used Verilog files (for rtlsim later on) tcl.append( "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " @@ -565,6 +610,10 @@ def apply(self, model): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception("CreateStitchedIP failed, no wrapper HDL found.") + raise Exception( + """CreateStitchedIP failed, no wrapper HDL found under %s or %s. + Please check logs under the parent directory.""" + % (wrapper_filename, wrapper_filename_alt) + ) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py new file mode 100644 index 0000000000..dc660f5fba --- /dev/null +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
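The per-port memory-map loop in the Tcl above walks each AXI-lite interface down to the pin that reports an ADDR_WIDTH, then sizes the aperture from it. The rounding rule is easy to miss; a minimal Python sketch of the same arithmetic (the helper name is ours, not part of this patch):

    def axilite_aperture(addr_width):
        # aperture covers the full decoded address space of the interface
        rng = 2 ** addr_width
        # anything below 4 KiB is rounded up, matching the Tcl:
        #   set range [expr $range < 4096 ? 4096 : $range]
        return max(rng, 4096)

    assert axilite_aperture(10) == 4096   # 1 KiB decoded space -> 4 KiB aperture
    assert axilite_aperture(16) == 65536  # 64 KiB decoded space stays 64 KiB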
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import qonnx.custom_op.registry as registry +import warnings +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.base import NodeLocalTransformation + +from finn.util.fpgadataflow import is_fpgadataflow_node + + +class DeriveCharacteristic(NodeLocalTransformation): + """For each node in the graph, run rtlsim to obtain the i/o + characteristic function for FIFO sizing and set the attribute. + It is assumed that the PrepareRTLSim transformation was already + called on the graph. + + This transformation performs rtlsim for each node, so it will run for + some time (minutes to hours depending on configuration). + + * period (int) desired period over which the characteristic function + will be derived. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ + + def __init__(self, period, num_workers=None, manual_bypass=False): + super().__init__(num_workers=num_workers) + self.period = period + self.manual_bypass = manual_bypass + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + inst = registry.getCustomOp(node) + inst.derive_characteristic_fxns(period=self.period) + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." 
% op_type) + return (node, False) + + def apply(self, model: ModelWrapper): + (model, run_again) = super().apply(model) + if not self.manual_bypass: + return (model, run_again) + # apply manual fix for DuplicateStreams and AddStreams for + # simple residual reconvergent paths with bypass + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + for addstrm_node in addstrm_nodes: + # we currently only support the case where one branch is + # a bypass + b0 = model.find_producer(addstrm_node.input[0]) + b1 = model.find_producer(addstrm_node.input[1]) + if (b0 is None) or (b1 is None): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" + b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + if (not b0_is_bypass) and (not b1_is_bypass): + warnings.warn("Found unsupported AddStreams, skipping") + return (model, run_again) + ds_node = b0 if b0_is_bypass else b1 + comp_branch_last = b1 if b0_is_bypass else b0 + + ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1] + comp_branch_first = model.find_consumer(ds_comp_bout) + if comp_branch_first is None or comp_branch_last is None: + warnings.warn("Found unsupported DuplicateStreams, skipping") + return (model, run_again) + comp_branch_last = registry.getCustomOp(comp_branch_last) + comp_branch_first = registry.getCustomOp(comp_branch_first) + # for DuplicateStreams, use comp_branch_first's input characterization + # for AddStreams, use comp_branch_last's output characterization + period = comp_branch_first.get_nodeattr("io_chrc_period") + comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[: 2 * period] + comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[2 * period :] + ds_node_inst = registry.getCustomOp(ds_node) + addstrm_node_inst = registry.getCustomOp(addstrm_node) + ds_node_inst.set_nodeattr("io_chrc_period", period) + ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2) + addstrm_node_inst.set_nodeattr("io_chrc_period", period) + addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2) + warnings.warn(f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}") + warnings.warn(f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}") + return (model, run_again) + + +class DeriveFIFOSizes(NodeLocalTransformation): + """Prerequisite: DeriveCharacteristic already called on graph. + For each node in the graph, use the accumulated I/O characteristic function + to perform FIFO sizing, setting the in/outFIFODepths attributes of HLSCustomOp + nodes. + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. 
+ """ + + def __init__(self, num_workers=None, io_fifo_depth=32): + super().__init__(num_workers=num_workers) + self.io_fifo_depth = io_fifo_depth + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_fpgadataflow_node(node) is True: + try: + # lookup op_type in registry of CustomOps + prod = registry.getCustomOp(node) + assert op_type != "StreamingFIFO", "Found existing FIFOs" + period = prod.get_nodeattr("io_chrc_period") + prod_chrc = prod.get_nodeattr("io_chrc_out")[0] + assert len(prod_chrc) == 2 * period, "Found unexpected characterization attribute" + if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]): + # FIFO depth already set, can skip this node + return (node, False) + + # find consumers + model = self.ref_input_model + out_fifo_depths = [] + for output_name in node.output: + cons_node = model.find_consumer(output_name) + if cons_node is None: + # could be final node, will be overridden if so + # need an entry in the list anyway + out_fifo_depths.append(self.io_fifo_depth) + continue + cons = registry.getCustomOp(cons_node) + cons_chrc = cons.get_nodeattr("io_chrc_in")[0] + # find minimum phase shift satisfying the constraint + pshift_min = period - 1 + for pshift_cand in range(period): + prod_chrc_part = prod_chrc[pshift_cand:period] + cons_chrc_part = cons_chrc[: period - pshift_cand] + if (prod_chrc_part >= cons_chrc_part).all(): + pshift_min = pshift_cand + break + prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)] + cons_chrc_part = cons_chrc[:period] + fifo_depth = int((prod_chrc_part - cons_chrc_part).max()) + out_fifo_depths.append(fifo_depth) + # set output FIFO depth for this (producing) node + # InsertFIFO looks at the max of (outFIFODepths, inFIFODepths) + # for each tensor + prod.set_nodeattr("outFIFODepths", out_fifo_depths) + + # finally, check node inputs to ensure FIFOs are added to + # any top-level inputs (at least self.io_fifo_depth deep) + in_fifo_depths = prod.get_nodeattr("inFIFODepths") + for i, input_name in enumerate(node.input): + if input_name in [x.name for x in model.graph.input]: + in_fifo_depths[i] = max(self.io_fifo_depth, in_fifo_depths[i]) + prod.set_nodeattr("inFIFODepths", in_fifo_depths) + + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." 
% op_type) + return (node, False) diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py index 732b82c675..633db0c553 100644 --- a/src/finn/transformation/fpgadataflow/externalize_params.py +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -64,11 +64,7 @@ def filter_fc_extw(x): assert iodma_init is not None # remove output-side initializer to get correct dataflow partitioning model.graph.initializer.remove( - [ - x - for x in model.graph.initializer - if x.name == extw_tensor_name_out - ][0] + [x for x in model.graph.initializer if x.name == extw_tensor_name_out][0] ) graph_modified = True diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 6792017223..d43aabcf55 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -56,7 +56,6 @@ def __init__(self, floorplan=None): self.user_floorplan = floorplan def apply(self, model): - # read in a user-specified floorplan or generate a default one if self.user_floorplan is None: self.user_floorplan = model.analysis(floorplan_params) @@ -129,9 +128,7 @@ def apply(self, model): non_dma_nodes, ) ) - non_dma_nodes = list( - filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes) - ) + non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) for node in dma_nodes: node_inst = getCustomOp(node) @@ -151,6 +148,7 @@ def apply(self, model): node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue + elif not ( node.op_type == "MatrixVectorActivation" and node_inst.get_nodeattr("mem_mode") is not None @@ -165,9 +163,15 @@ def apply(self, model): pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - break + axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] + if len(axilite_intf_name) != 0: + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + else: + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break + else: # no matching, new partition node_inst.set_nodeattr("partition_id", partition_cnt) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index 1fede06678..08069fa00f 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -64,7 +64,9 @@ def applyNodeLocal(self, node): ), """Node attribute "code_gen_dir_ipgen" is empty. Please run transformation PrepareIP first.""" - if not os.path.isdir(inst.get_nodeattr("ipgen_path")): + if not os.path.isdir(inst.get_nodeattr("ipgen_path")) or not inst.get_nodeattr( + "code_gen_dir_ipgen" + ) in inst.get_nodeattr("ipgen_path"): # call the compilation function for this node inst.ipgen_singlenode_code() else: @@ -77,7 +79,5 @@ def applyNodeLocal(self, node): is empty.""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 9817f2e3d2..140d154b1a 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -48,8 +48,7 @@ def apply(self, model): if consumers == []: continue assert len(consumers) == 1, ( - n.name - + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HLS node with fan-out higher than 1 cannot be stitched" ) consumer = consumers[0] if _suitable_node(consumer) is True: @@ -81,6 +80,11 @@ def apply(self, model): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() + # use hls mode by default since it supports more configs + # vivado mode can be manually enabled by user, but does not + # support e.g. node-by-node rtlsim needed for + # characterization-based FIFO sizing + impl_style = "hls" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,6 +109,7 @@ def apply(self, model): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), + impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 78200b2809..f57c9e41b7 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -67,19 +67,29 @@ class InsertFIFO(Transformation): between fpgadataflow nodes. Takes the setting for the depth from the surrounding nodes by extracting - node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth' + node attribute 'outFIFODepths' of the previous and node attribute 'inFIFODepths' of the subsequent node. max() of these two values sets the FIFO depth. - Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming - interfaces already have a degree of buffering. You can set - create_shallow_fifos=True to override this default behavior. + Constructor arguments: + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute + to be used for large FIFOs implemented by Vivado + :parameter create_shallow_fifos: Normally, shallow-depth (<=2) FIFOs + won't be created since HLS streaming interfaces + already have a degree of buffering. + Override with this parameter.
+ The other node attributes necessary to create a FIFO node are taken from the node the FIFO node is inserted after: 'folded_shape' and 'dtype'""" - def __init__(self, create_shallow_fifos=False): + def __init__(self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto"): super().__init__() self.create_shallow_fifos = create_shallow_fifos + self.max_qsrl_depth = max_qsrl_depth + self.vivado_ram_style = vivado_ram_style def apply(self, model): graph = model.graph @@ -88,8 +98,8 @@ def apply(self, model): for first_node in graph.node: node_ind += 1 if _suitable_node(first_node): - for n_output in first_node.output: - consumers = model.find_consumers(n_output) + for idx_out, output_name in enumerate(first_node.output): + consumers = model.find_consumers(output_name) if consumers == []: continue if len(consumers) > 1: @@ -108,11 +118,9 @@ def apply(self, model): # input of the second node is equal n1 = getCustomOp(consumer) for idx, inp in enumerate(consumer.input): - if inp == n_output: - if idx == 0: - fld_shape_2 = n1.get_folded_input_shape() - else: - fld_shape_2 = n1.get_folded_input_shape(ind=idx) + if inp == output_name: + fld_shape_2 = n1.get_folded_input_shape(ind=idx) + idx_inp = idx assert _suitable_folded_shapes( fld_shape, fld_shape_2 ), """The @@ -120,14 +128,12 @@ def apply(self, model): folded output shape of the second node. A streaming fifo can't be implemented in between these nodes.""" - # check if outFIFOdepth attribute of first node - # and inFIFOdepth attribute of consumer node is equal - n0_depth = n0.get_nodeattr("outFIFODepth") - n1_depth = n1.get_nodeattr("inFIFODepth") - if n0_depth == n1_depth: - fifo_depth = n0_depth - elif n0_depth != n1_depth: - fifo_depth = max(n0_depth, n1_depth) + # check if outFIFOdepths attribute of first node + # and inFIFOdepths attribute of consumer node is equal + n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out] + n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp] + + fifo_depth = max(n0_depth, n1_depth) if fifo_depth > 2 or self.create_shallow_fifos: # assumption: HLS streaming components already have @@ -143,25 +149,32 @@ def apply(self, model): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: + impl_style = "rtl" + else: + impl_style = "vivado" + fifo_node = oh.make_node( "StreamingFIFO", - [n_output], + [output_name], [fifo_output_tensor.name], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, ) # insert fifo graph.node.insert(node_ind + 1, fifo_node) # set fifo output tensor as new input tensor of second node for idx, inp in enumerate(consumer.input): - if inp == n_output: + if inp == output_name: consumer.input[idx] = fifo_output_tensor.name - # ensure created FIFO depth is reflected on both sides - n0.set_nodeattr("outFIFODepth", fifo_depth) - n1.set_nodeattr("inFIFODepth", fifo_depth) + # removed setting of node attributes based on created + # FIFO sizes here, better to preserve original attrs + # as they are. 
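The phase-shift search inside DeriveFIFOSizes earlier in this patch is the heart of characterization-based FIFO sizing: slide the producer's cumulative output curve against the consumer's cumulative demand curve until the producer always stays ahead, then size the FIFO for the worst-case surplus. A self-contained numpy sketch of that search, with toy characteristic data (the helper name and example curves are ours):

    import numpy as np

    def size_fifo(prod_chrc, cons_chrc, period):
        # smallest phase shift at which producer output covers consumer demand
        pshift_min = period - 1
        for pshift_cand in range(period):
            prod_part = prod_chrc[pshift_cand:period]
            cons_part = cons_chrc[: period - pshift_cand]
            if (prod_part >= cons_part).all():
                pshift_min = pshift_cand
                break
        # the FIFO must absorb the worst-case surplus at that shift
        prod_part = prod_chrc[pshift_min : pshift_min + period]
        cons_part = cons_chrc[:period]
        return int((prod_part - cons_part).max())

    period = 8
    # producer emits one word per cycle; consumer stalls for its first 4 cycles
    prod = np.cumsum(np.ones(2 * period, dtype=np.int64))
    cons = np.cumsum(np.array([0] * 4 + [1] * (2 * period - 4), dtype=np.int64))
    print(size_fifo(prod, cons, period))  # -> 4: the FIFO buffers the 4-cycle stall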
graph_modified = True if graph_modified is False: @@ -169,96 +182,107 @@ def apply(self, model): for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) # insert FIFO as first node, except when first node is DMA - if ( - first_node.op_type != "StreamingFIFO" - and first_node.op_type != "IODMA" - ): + if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA": inp_ind = list(first_node.input).index(graph_in_name) n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) # determine fifo node attributes - if inp_ind == 0: - fld_shape = n0.get_folded_input_shape() - dtype = n0.get_input_datatype() + fld_shape = n0.get_folded_input_shape(inp_ind) + dtype = n0.get_input_datatype(inp_ind) + fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] + + if fifo_depth > 2 or self.create_shallow_fifos: + # create fifo node + fifo_output_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_input_shape(), + ) + graph.value_info.append(fifo_output_tensor) + model.set_tensor_datatype(fifo_output_tensor.name, dtype) + + # only use rtl-style FIFOs to avoid simulation bug + # (top-level IOs should not have impl_style=vivado) + impl_style = "rtl" + + fifo_node = oh.make_node( + "StreamingFIFO", + [n_input], + [fifo_output_tensor.name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, + ) + # insert fifo + graph.node.insert(0, fifo_node) + + # set fifo output tensor as new input tensor of second node + first_node.input[inp_ind] = fifo_output_tensor.name else: - fld_shape = n0.get_folded_input_shape(inp_ind) - dtype = n0.get_input_datatype(inp_ind) - fifo_depth = n0.get_nodeattr("inFIFODepth") - - if fifo_depth <= 2: - warnings.warn("Overriding input FIFO depth to 32") - fifo_depth = 32 - - # create fifo node - fifo_output_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_input_shape(), - ) - graph.value_info.append(fifo_output_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [n_input], - [fifo_output_tensor.name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.insert(0, fifo_node) - - # set fifo output tensor as new input tensor of second node - first_node.input[inp_ind] = fifo_output_tensor.name + warnings.warn( + """Input FIFO for %s has depth %d and won't + be created. This may cause RTL simulation issues. 
+ """ + % (graph_in_name, fifo_depth) + ) # insert FIFO as last node, except when last node is DMA graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if ( - final_node.op_type != "StreamingFIFO" - and final_node.op_type != "IODMA" - ): + if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA": assert ( final_node.op_type != "TLastMarker" ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) + out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - fifo_depth = n0.get_nodeattr("outFIFODepth") - - if fifo_depth <= 2: - warnings.warn("Overriding output FIFO depth to 32") - fifo_depth = 32 - - # create fifo node - fifo_input_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), - ) - graph.value_info.append(fifo_input_tensor) - model.set_tensor_datatype(fifo_input_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [fifo_input_tensor.name], - [graph_out_name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.append(fifo_node) - - # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + fld_shape = n0.get_folded_output_shape(out_ind) + dtype = n0.get_output_datatype(out_ind) + fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] + + if fifo_depth > 2 or self.create_shallow_fifos: + # create fifo node + fifo_input_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_output_shape(), + ) + graph.value_info.append(fifo_input_tensor) + model.set_tensor_datatype(fifo_input_tensor.name, dtype) + + # only use rtl-style FIFOs to avoid simulation bug + # (top-level IOs should not have impl_style=vivado) + impl_style = "rtl" + + fifo_node = oh.make_node( + "StreamingFIFO", + [fifo_input_tensor.name], + [graph_out_name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + impl_style=impl_style, + ram_style=self.vivado_ram_style, + ) + # insert fifo + graph.node.append(fifo_node) + + # set fifo output tensor as new input tensor of second node + final_node.output[0] = fifo_input_tensor.name + else: + warnings.warn( + """Output FIFO for %s has depth %d and won't + be created. This may cause RTL simulation issues. 
+ """ + % (graph_out_name, fifo_depth) + ) return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py index 21ec3f049f..14989efa75 100644 --- a/src/finn/transformation/fpgadataflow/insert_hook.py +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -74,8 +74,7 @@ def apply(self, model): for output_name in n.output: consumers = model.find_consumers(output_name) assert len(consumers) <= 1, ( - n.name - + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HLS node with fan-out higher than 1 cannot be stitched" ) n0 = getCustomOp(n) n0_hook = n0.get_nodeattr("output_hook") diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 4b4eb6362f..90700d5726 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -51,9 +51,7 @@ def __init__( self.insert_input = insert_input self.insert_output = insert_output self.insert_extmemw = insert_extmemw - assert ( - 2 ** math.log2(max_intfwidth) == max_intfwidth - ), "max_intfwidth must be a power of 2" + assert 2 ** math.log2(max_intfwidth) == max_intfwidth, "max_intfwidth must be a power of 2" self.max_intfwidth = max_intfwidth def get_mem_init(self, weights, pe, simd): @@ -122,13 +120,9 @@ def apply(self, model): padded_instream_width = first_node_inst.get_instream_width_padded() padded_instream_bytes = padded_instream_width // 8 # determine the feasible interface width - transfer_bits = padded_instream_width * np.prod( - in_folded_shape[:-1] - ) + transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # make new buffer first_node_in = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape @@ -169,18 +163,12 @@ def apply(self, model): # take advantage of AXI stream width padding for DMA alignment # (AXI streams are always padded to 8 bits) # this is the width of stream input to DMA - padded_outstream_width = ( - final_node_inst.get_outstream_width_padded() - ) + padded_outstream_width = final_node_inst.get_outstream_width_padded() padded_outstream_bytes = padded_outstream_width // 8 # determine the feasible interface width - transfer_bits = padded_outstream_width * np.prod( - out_folded_shape[:-1] - ) + transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # make new buffer final_node_out = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape @@ -211,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type == "MatrixVectorActivation" + lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, @@ -225,9 +213,7 @@ def apply(self, model): # determine the feasible interface width transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( 
- intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # calculate width of stream output from DMA pe = get_by_name(fc_node.attribute, "PE").i simd = get_by_name(fc_node.attribute, "SIMD").i @@ -259,6 +245,10 @@ def apply(self, model): ) fc_node.input[1] = fc_node_in.name model.graph.node.insert(0, dma_node) + # expand inFIFODepths for new second input of node + infifo_depth = fc_inst.get_nodeattr("inFIFODepths") + infifo_depth.append(8) + fc_inst.set_nodeattr("inFIFODepths", infifo_depth) modified = True if modified: model = model.transform(SortGraph()) diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 1610916eb6..94f0b0eae1 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -54,10 +54,8 @@ def apply(self, model): graph_modified = False if final_node.op_type != "TLastMarker" and not ( final_node.op_type == "IODMA" - and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") - == "out" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") == "out" ): - custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -113,18 +111,13 @@ def apply(self, model): # 2. node is either a TLastMarker or an input IODMA if first_node.op_type != "TLastMarker" and not ( first_node.op_type == "IODMA" - and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") - == "in" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") == "in" ): - custom_op = getCustomOp(first_node) num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if ( - first_node.op_type == "MatrixVectorActivation" - and inp_idx == 1 - ): + if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py deleted file mode 100644 index d4684dc83c..0000000000 --- a/src/finn/transformation/fpgadataflow/make_deployment.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import subprocess -from distutils.dir_util import copy_tree -from qonnx.transformation.base import Transformation -from shutil import copy - -import finn.transformation.fpgadataflow.templates as templates -from finn.util.basic import make_build_dir - - -class DeployToPYNQ(Transformation): - """Collects all necessary files for deployment and copies them to the PYNQ board. - Expects information about PYNQ board to make scp possible: - - IP address of board, username and password for board and target directory where - the files are stored on the board""" - - def __init__(self, ip, port, username, password, target_dir): - super().__init__() - self.ip = ip - self.port = port - self.username = username - self.password = password - self.target_dir = target_dir - - def apply(self, model): - # set metadata properties accordingly to user input specifications - model.set_metadata_prop("pynq_ip", self.ip) - model.set_metadata_prop("pynq_port", str(self.port)) - model.set_metadata_prop("pynq_username", self.username) - model.set_metadata_prop("pynq_password", self.password) - model.set_metadata_prop("pynq_target_dir", self.target_dir) - - # create directory for deployment files - deployment_dir = make_build_dir(prefix="pynq_deployment_") - model.set_metadata_prop("pynq_deployment_dir", deployment_dir) - - # get and copy necessary files - # .bit and .hwh file - bitfile = model.get_metadata_prop("bitfile") - hwh_file = model.get_metadata_prop("hw_handoff") - deploy_files = [bitfile, hwh_file] - - for dfile in deploy_files: - if dfile is not None: - copy(dfile, deployment_dir) - - # helper script for Alveo - platform = model.get_metadata_prop("platform") - if platform == "alveo": - alveo_run_sh = templates.alveo_run_sh_template - fill_dict = { - "$REMOTE_DEPLOY_DIR$": self.target_dir - + "/" - + os.path.basename(deployment_dir), - "$CONDA_ENV_NAME$": "finn-pynq-alveo", - "$REMOTE_XRT$": os.environ["XILINX_XRT"], - "$REMOTE_PLATFORM_REPO_PATHS$": os.environ["PLATFORM_REPO_PATHS"], - "$BITFILE$": os.path.basename(bitfile), - } - for key, value in fill_dict.items(): - alveo_run_sh = alveo_run_sh.replace(key, value) - alveo_run_sh_path = deployment_dir + "/alveo_run.sh" - with open(alveo_run_sh_path, "w") as f: - f.write(alveo_run_sh) - - # driver.py and python libraries - pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir") - copy_tree(pynq_driver_dir, deployment_dir) - model.set_metadata_prop("pynq_deploy_dir", deployment_dir) - model.set_metadata_prop("exec_mode", "remote_pynq") - - # create target directory on PYNQ board - cmd = 'ssh {}@{} -p {} "mkdir -p {}"'.format( - self.username, self.ip, self.port, self.target_dir - ) - bash_command = ["/bin/bash", "-c", cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - # copy directory to PYNQ board using scp - cmd = "scp -P{} -r {} {}@{}:{}".format( - self.port, deployment_dir, self.username, self.ip, self.target_dir - ) - bash_command = ["/bin/bash", "-c", 
cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - - return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index dce98e54a3..6d1fa290b4 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -26,9 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pkg_resources as pk - import numpy as np import os import qonnx @@ -56,14 +53,10 @@ def to_external_tensor(init, w_dtype): weight_width = init.shape[1] * w_dtype.bitwidth() weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - hex_init = pack_innermost_dim_as_hex_string( - init, w_dtype, weight_width_padded, prefix="0x" - ) + hex_init = pack_innermost_dim_as_hex_string(init, w_dtype, weight_width_padded, prefix="0x") ext_weight = np.array([], dtype=np.uint8) for line in hex_init: - array_line = [ - x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x")) - ] + array_line = [x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))] ext_weight = np.append(ext_weight, array_line) return ext_weight @@ -88,14 +81,13 @@ def __init__(self, platform): self.platform = platform def apply(self, model): - # create a temporary folder for the generated driver pynq_driver_dir = make_build_dir(prefix="pynq_driver_") model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) # create the base FINN driver -- same for all accels - driver_base_template = pk.resource_filename( - "finn.qnn-data", "templates/driver/driver_base.py" + driver_base_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_base.py" ) driver_base_py = pynq_driver_dir + "/driver_base.py" shutil.copy(driver_base_template, driver_base_py) @@ -115,9 +107,7 @@ def apply(self, model): files_to_copy.append( (qonnx_path + "/core/__init__.py", qonnx_target_path + "/core/__init__.py") ) - files_to_copy.append( - (qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py") - ) + files_to_copy.append((qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py")) files_to_copy.append( (qonnx_path + "/util/__init__.py", qonnx_target_path + "/util/__init__.py") ) @@ -133,7 +123,7 @@ def apply(self, model): finn_target_path + "/util/__init__.py", ) ) - for (src_file, target_file) in files_to_copy: + for src_file, target_file in files_to_copy: shutil.copy(src_file, target_file) # extract input-output shapes from the graph # TODO convert this to an analysis pass? 
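The reflowed to_external_tensor above flattens each hex-encoded weight row into little-endian bytes via reversed(hexstring2npbytearray(line, remove_prefix="0x")). A standalone sketch of that byte-order step in plain numpy (the helper name is ours, not the FINN API):

    import numpy as np

    def hexline_to_le_bytes(hex_line):
        # drop the "0x" prefix and pad to a whole number of bytes
        h = hex_line.removeprefix("0x")
        if len(h) % 2:
            h = "0" + h
        # bytes.fromhex parses in big-endian order; reverse for the DMA layout
        return np.frombuffer(bytes.fromhex(h), dtype=np.uint8)[::-1]

    print(hexline_to_le_bytes("0x0a0b0c"))  # -> [12 11 10]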
@@ -165,13 +155,9 @@ def apply(self, model): first_node = successor_df_model.find_consumer( successor_df_model.graph.input[successor_input_num].name ) - i_tensor_shape_folded = tuple( - getCustomOp(first_node).get_folded_input_shape() - ) + i_tensor_shape_folded = tuple(getCustomOp(first_node).get_folded_input_shape()) # generate dummy folded i/o tensors and their packed versions - i_tensor_dummy_folded = gen_finn_dt_tensor( - i_tensor_dt, i_tensor_shape_folded - ) + i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded) i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( i_tensor_dummy_folded, i_tensor_dt ) @@ -201,24 +187,16 @@ def apply(self, model): ), """ Ensure CreateDataflowPartition called before driver creation.""" df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model")) - assert ( - df_model.graph.node[-1].op_type == "IODMA" - ), "Partition must hold output IODMA" + assert df_model.graph.node[-1].op_type == "IODMA", "Partition must hold output IODMA" predecessors = model.find_direct_predecessors(o_producer) - predecessor_output_num = list(predecessors[0].output).index( - o_producer.input[0] - ) + predecessor_output_num = list(predecessors[0].output).index(o_producer.input[0]) predecessor_sdp = getCustomOp(predecessors[0]) predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model")) last_node = predecessor_df_model.find_producer( predecessor_df_model.graph.output[predecessor_output_num].name ) - o_tensor_shape_folded = tuple( - getCustomOp(last_node).get_folded_output_shape() - ) - o_tensor_dummy_folded = gen_finn_dt_tensor( - o_tensor_dt, o_tensor_shape_folded - ) + o_tensor_shape_folded = tuple(getCustomOp(last_node).get_folded_output_shape()) + o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded) o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( o_tensor_dummy_folded, o_tensor_dt ) @@ -256,17 +234,11 @@ def apply(self, model): assert df_model.graph.node[0].op_type == "IODMA" iodma_node = getCustomOp(df_model.graph.node[0]) if iodma_node.get_nodeattr("burstMode") == "wrap": # input weights dma? 
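The dummy folded tensors in the hunks above exist only so the driver can learn the packed buffer shapes from finnpy_to_packed_bytearray. The packing itself is plain bit manipulation; a deliberately simplified sketch for a 2-bit datatype, ignoring FINN's padding and endianness rules (illustrative only, not the finnpy layout):

    import numpy as np

    def pack_uint2_row(row):
        # pack four 2-bit values per byte, first element in the LSBs
        assert len(row) % 4 == 0
        out = []
        for i in range(0, len(row), 4):
            b = 0
            for j, v in enumerate(row[i : i + 4]):
                b |= (int(v) & 0x3) << (2 * j)
            out.append(b)
        return np.array(out, dtype=np.uint8)

    print(pack_uint2_row([1, 2, 3, 0]))  # -> [57] == 0b00111001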
- init_tensor = df_model.get_initializer( - iodma_node.onnx_node.input[0] - ) + init_tensor = df_model.get_initializer(iodma_node.onnx_node.input[0]) ext_weight_dma_cnt += 1 - w_dtype = df_model.get_tensor_datatype( - iodma_node.onnx_node.input[0] - ) + w_dtype = df_model.get_tensor_datatype(iodma_node.onnx_node.input[0]) init_external_tensor = to_external_tensor(init_tensor, w_dtype) - np.save( - weights_dir + "/" + idma_name + ".npy", init_external_tensor - ) + np.save(weights_dir + "/" + idma_name + ".npy", init_external_tensor) idma_idx += 1 # fill in the driver template @@ -293,8 +265,8 @@ def apply(self, model): # add validate.py to run full top-1 test (only for suitable networks) validate_py = pynq_driver_dir + "/validate.py" - validate_template = pk.resource_filename( - "finn.qnn-data", "templates/driver/validate.py" + validate_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/validate.py" ) shutil.copy(validate_template, validate_py) @@ -318,9 +290,7 @@ def apply(self, model): rt_layer_ind, node.name, ) - node_inst.make_weight_file( - fcl_w, "decoupled_runtime", w_filename - ) + node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename) rt_layer_ind += 1 elif node.op_type == "StreamingDataflowPartition": warnings.warn( diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index a589cb039c..989eb62a88 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.util.basic import make_build_dir, pynq_part_map +from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . import templates @@ -92,7 +92,6 @@ def __init__(self, platform, enable_debug=False): self.enable_debug = 1 if enable_debug else 0 def apply(self, model): - # create a config file and empty list of xo files config = [] idma_idx = 0 @@ -110,15 +109,12 @@ def apply(self, model): ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): raise Exception( - "No stitched IPI design found for %s, apply CreateStitchedIP first." - % node.name + "No stitched IPI design found for %s, apply CreateStitchedIP first." % node.name ) vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") if vivado_stitch_vlnv is None: - raise Exception( - "No vlnv found for %s, apply CreateStitchedIP first." % node.name - ) + raise Exception("No vlnv found for %s, apply CreateStitchedIP first." 
% node.name) ip_dirs = ["list"] ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) @@ -170,9 +166,7 @@ def apply(self, model): "[get_bd_intf_pins smartconnect_0/S%02d_AXI]" % (instance_names[node.name], aximm_idx) ) - assert ( - len(ifnames["axilite"]) == 1 - ), "Must have 1 AXI lite interface on IODMA nodes" + assert len(ifnames["axilite"]) == 1, "Must have 1 AXI lite interface on IODMA nodes" axilite_intf_name = ifnames["axilite"][0] assert axilite_intf_name is not None config.append( @@ -182,8 +176,7 @@ def apply(self, model): ) # assign_bd_address with appropriate range/offset config.append( - "assign_axi_addr_proc %s/%s" - % (instance_names[node.name], axilite_intf_name) + "assign_axi_addr_proc %s/%s" % (instance_names[node.name], axilite_intf_name) ) aximm_idx += 1 @@ -269,23 +262,18 @@ def apply(self, model): bash_command = ["bash", synth_project_sh] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() - bitfile_name = ( - vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" - ) + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" - % vivado_pynq_proj_dir + "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" copy(bitfile_name, deploy_bitfile_name) # set bitfile attribute model.set_metadata_prop("bitfile", deploy_bitfile_name) hwh_name_alts = [ - vivado_pynq_proj_dir - + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh", - vivado_pynq_proj_dir - + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh", + vivado_pynq_proj_dir + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh", + vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh", ] hwh_name = None for hwh_name_cand in hwh_name_alts: @@ -293,8 +281,7 @@ def apply(self, model): hwh_name = hwh_name_cand if not os.path.isfile(hwh_name): raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" - % vivado_pynq_proj_dir + "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir ) deploy_hwh_name = vivado_pynq_proj_dir + "/resizer.hwh" copy(hwh_name, deploy_hwh_name) @@ -320,6 +307,7 @@ def __init__( ): super().__init__() self.fpga_part = pynq_part_map[platform] + self.axi_port_width = pynq_native_port_width[platform] self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug @@ -330,7 +318,7 @@ def apply(self, model): model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels prep_transforms = [ - InsertIODMA(64), + InsertIODMA(self.axi_port_width), InsertDWC(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), @@ -349,21 +337,15 @@ def apply(self, model): kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) - kernel_model = kernel_model.transform( - PrepareIP(self.fpga_part, self.period_ns) - ) + kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) kernel_model = kernel_model.transform(HLSSynthIP()) kernel_model = kernel_model.transform( - CreateStitchedIP( - self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False - ) + CreateStitchedIP(self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False) ) kernel_model.set_metadata_prop("platform", "zynq-iodma") kernel_model.save(dataflow_model_filename) # Assemble design from IPs - model = model.transform( - MakeZYNQProject(self.platform, enable_debug=self.enable_debug) - ) + model = model.transform(MakeZYNQProject(self.platform, enable_debug=self.enable_debug)) # set platform attribute for correct remote execution model.set_metadata_prop("platform", "zynq-iodma") diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py index bc020ca428..8d04d5b817 100644 --- a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py @@ -28,6 +28,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes from finn.util.fpgadataflow import is_fpgadataflow_node @@ -41,9 +42,15 @@ def __init__(self): super().__init__() def apply(self, model): - for node in model.graph.node: + for node_id in range(len(model.graph.node)): + # Since InferDataTypes potentially changes node attributes in each loop iterations, + # the for-loop cannot loop over a list of a snapshot of the graph's node protos + node = model.graph.node[node_id] if is_fpgadataflow_node(node) is True: inst = getCustomOp(node) if hasattr(inst, "minimize_accumulator_width"): inst.minimize_accumulator_width(model) + # Since this transformation is applied iteratively, we have to ensure that + # we propagate the new datatype to other layers + model = model.transform(InferDataTypes()) return (model, False) diff --git a/tests/end2end/test_end2end_access_board.py b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py similarity index 64% rename from tests/end2end/test_end2end_access_board.py rename to src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py index ba3c49195b..32871cc44a 100644 --- a/tests/end2end/test_end2end_access_board.py +++ b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, 
Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,31 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pytest +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation -import subprocess +from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.test import get_build_env +class MinimizeWeightBitWidth(Transformation): + """For relevant nodes, call the weight bit width minimization + functions to save on resources. May alter tensor weightDataType + if the node does not have runtime writeable weights.""" -@pytest.mark.board -@pytest.mark.end2end -def test_end2end_access_board(): - build_env = get_build_env("zynq", 5) - if build_env["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - remote_cmd_base = [ - "ssh", - "-o", - "PreferredAuthentications=publickey", - "-o", - "PasswordAuthentication=no", - "%s@%s" % (build_env["username"], build_env["ip"]), - ] - test_text = "BoardIsAccessible" - touch_cmd = remote_cmd_base + ["echo %s" % test_text] - verif_res = subprocess.run( - touch_cmd, stdout=subprocess.PIPE, universal_newlines=True - ) - assert verif_res.returncode == 0 - assert verif_res.stdout.split("\n")[0] == test_text + def __init__(self): + super().__init__() + + def apply(self, model): + for node in model.graph.node: + if is_fpgadataflow_node(node) is True: + inst = getCustomOp(node) + if hasattr(inst, "minimize_weight_bit_width"): + inst.minimize_weight_bit_width(model) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 07021c1e8d..76c3f88310 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -49,9 +49,7 @@ def _codegen_single_node(node, model): code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim") # ensure that there is a directory if code_gen_dir == "" or not os.path.isdir(code_gen_dir): - code_gen_dir = make_build_dir( - prefix="code_gen_cppsim_" + str(node.name) + "_" - ) + code_gen_dir = make_build_dir(prefix="code_gen_cppsim_" + str(node.name) + "_") inst.set_nodeattr("code_gen_dir_cppsim", code_gen_dir) # ensure that there is generated code inside the dir inst.code_generation_cppsim(model) diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index 2ebd6310f0..5461bbd77c 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -47,9 +47,7 @@ def _codegen_single_node(node, model, fpgapart, clk): code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen") # ensure that there is a directory if code_gen_dir == "" or not os.path.isdir(code_gen_dir): - code_gen_dir = make_build_dir( - prefix="code_gen_ipgen_" + str(node.name) + "_" - ) + code_gen_dir = make_build_dir(prefix="code_gen_ipgen_" + str(node.name) + "_") inst.set_nodeattr("code_gen_dir_ipgen", code_gen_dir) # ensure that there is generated code inside the dir inst.code_generation_ipgen(model, fpgapart, clk) diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py index 645d86cf14..8ba7cfd965 100644 --- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py +++ 
b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -74,7 +74,5 @@ def applyNodeLocal(self, node): ), "Failed to prepare RTLSim, no rtlsim_so attribute found." except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index a08d153cb2..8488b4ef83 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -56,7 +56,5 @@ def apply(self, model): was not successful. Node attribute "exec_mode" is not set""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 0139c71666..da6099ab9a 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -29,10 +29,16 @@ import math import numpy as np import warnings +from onnx import TensorProto, helper from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk +from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, +) from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles @@ -42,7 +48,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.pyverilator import pyverilate_stitched_ip +from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim def reset_implementation(node): @@ -72,8 +78,9 @@ def optimize_depth(depth): # Q_srl FIFOs do not benefit from size < 32 # add some slack return 32 - # round to nearest power of two for Vivado IP FIFO implementation - return int(2 ** math.ceil(math.log2(depth))) + # otherwise leave as is + # will be rounded to nearest power of two for Vivado-style FIFO + return int(depth) class RemoveShallowFIFOs(Transformation): @@ -125,14 +132,17 @@ class CapConvolutionFIFODepths(Transformation): constructor flag is set. Constructor arguments: - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) Assumed input graph properties: + - all nodes are fpgadataflow nodes - FIFOs inserted with InsertAndSetFIFODepths Output: + - graph with smaller-depth FIFOs for convolutions Background: @@ -188,21 +198,25 @@ class InsertAndSetFIFODepths(Transformation): throughput in the created accelerator. 
Constructor arguments: - - clk_ns : clock period (used for IP preparation) - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) - - max_depth : how deep the "max"-sized FIFOs initially inserted will be - - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs - smaller where appropriate - - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado + + :parameter clk_ns: clock period (used for IP preparation) + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter max_depth: how deep the "max"-sized FIFOs initially inserted + will be. If set to None, use the tensor size as the depth + :parameter swg_exception: call CapConvolutionFIFODepths to make convolution FIFOs + smaller where appropriate + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used + for large FIFOs implemented by Vivado afterwards Assumed input graph properties: + - all nodes are fpgadataflow nodes - no FIFOs inserted, - - (inFIFODepth/outFIFODepth attrs will be ignored) + - (inFIFODepths/outFIFODepths attrs will be ignored) Output: + - graph with appropriate-depth FIFOs inserted Background: @@ -210,12 +224,14 @@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. The sizes of those FIFOs are hard to predict analytically, so we do the following: - - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes + + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim - when sim finished, update each FIFO depth to maximum observed occupancy - and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes + and set inFIFODepths/outFIFODepths attrs to that depth as well + """ def __init__( @@ -223,9 +239,10 @@ def __init__( fpgapart, clk_ns=10.0, max_qsrl_depth=256, - max_depth=2**14, + max_depth=None, swg_exception=True, vivado_ram_style="auto", + force_python_sim=False, ): super().__init__() self.fpgapart = fpgapart @@ -234,46 +251,62 @@ def __init__( self.max_depth = max_depth self.swg_exception = swg_exception self.vivado_ram_style = vivado_ram_style + self.force_python_sim = force_python_sim def apply(self, model): + # these optypes may potentially use external weights + # we'll temporarily change them to use decoupled mode for FIFO sizing + extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] for node in model.graph.node: # verify assumptions - assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str( - node - ) + assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(node) assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" node = getCustomOp(node) - node.set_nodeattr("inFIFODepth", self.max_depth) - node.set_nodeattr("outFIFODepth", self.max_depth) - if node.onnx_node.op_type == "MatrixVectorActivation": + ifd = node.get_nodeattr("inFIFODepths") + ofd = node.get_nodeattr("outFIFODepths") + if self.max_depth is not None: + ifd = [self.max_depth] * len(ifd) + ofd = [self.max_depth] * len(ofd) + else: + # set each FIFO to its tensor size + # (except stream 
width hence the :-1) + for i in range(len(ifd)): + ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1]) + for o in range(len(ofd)): + ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1]) + node.set_nodeattr("inFIFODepths", ifd) + node.set_nodeattr("outFIFODepths", ofd) + + if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) node.set_nodeattr("mem_mode", "decoupled") reset_implementation(node) warnings.warn( - "Changed mem_mode from external to decoupled for " - + node.onnx_node.name + "Changed mem_mode from external to decoupled for " + node.onnx_node.name ) # insert stream infrastructure (DWC/FIFO) model = model.transform(InsertDWC()) - model = model.transform(InsertFIFO()) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # gather FIFO names, check they are of expected depth fifos = {} - for node in model.graph.node: - if node.op_type == "StreamingFIFO": - fifos[node.name] = 0 - node = getCustomOp(node) - # check depths and fix as necessary - if node.get_nodeattr("depth") != self.max_depth: - node.set_nodeattr("depth", self.max_depth) + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + for node in fifo_nodes: + fifos[node.name] = 0 + node = getCustomOp(node) + node.set_nodeattr("depth_monitor", 1) + node.set_nodeattr("impl_style", "rtl") + # check depths and fix as necessary + if (self.max_depth is not None) and (node.get_nodeattr("depth") != self.max_depth): + node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim model = model.transform(AnnotateCycles()) @@ -285,75 +318,80 @@ def apply(self, model): model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") - # calculate input frequency (number of cycles for each input word) - first_node = getCustomOp(model.graph.node[0]) - ncycles_per_input = max( - 1, - int( - math.ceil( - perf["max_cycles"] - / ( - np.prod(first_node.get_folded_input_shape()) - / first_node.get_folded_input_shape()[-1] + if self.force_python_sim: + # do rtlsim in Python for FIFO sizing + # calculate input frequency (number of cycles for each input word) + first_node = getCustomOp(model.graph.node[0]) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) ) - ) - ), - ) + ), + ) - # set sufficiently large threshold for 1 image to fully execute and exit - ncycles = int(latency + max_cycles) + # set sufficiently large threshold for 1 image to fully execute and exit + ncycles = int(latency + max_cycles) - # prepare pyverilator model - sim = pyverilate_stitched_ip(model) + # prepare pyverilator model + sim = pyverilate_stitched_ip(model) - reset_rtlsim(sim) - toggle_clk(sim) + reset_rtlsim(sim) + toggle_clk(sim) - # set all input valids to 0 and output readies to 1 - # set input data to some constant - set_signal(sim, "tvalid", 0) - set_signal(sim, "tready", 1) - set_signal(sim, "tdata", 0) + # set all input valids to 0 and output readies to 1 + # set input data to some constant + set_signal(sim, "tvalid", 0) + set_signal(sim, "tready", 1) + set_signal(sim, "tdata", 0) + + output_detected = False + while ncycles > 0: + toggle_clk(sim) + # set/unset valids + if ncycles % ncycles_per_input == 0: + set_signal(sim, "tvalid", 1) + else: + 
set_signal(sim, "tvalid", 0)

-        output_detected = False
-        while ncycles > 0:
-            toggle_clk(sim)
-            # set/unset valids
-            if ncycles % ncycles_per_input == 0:
-                set_signal(sim, "tvalid", 1)
-            else:
-                set_signal(sim, "tvalid", 0)
-
-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
+                # since latency estimation is very pessimistic, detect first output
+                # and fast-forward the sim
+                if get_signal(sim, "tvalid") != 0 and not output_detected:
+                    ncycles = max_cycles
+                    output_detected = True
                 else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
-            # since latency estimation is very pessimistic, detect first output
-            # and fast-forward the sim
-            if get_signal(sim, "tvalid") != 0 and not output_detected:
-                ncycles = max_cycles
-                output_detected = True
+                    ncycles = ncycles - 1
+
+            if not output_detected:
+                warnings.warn("No output detected, calculated FIFO depths may not be correct")
+        else:
+            # do rtlsim in C++ for FIFO sizing
+            # determine # inputs for FIFO sizing according to topology type
+            swg_nodes = [x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type]
+            if len(swg_nodes) == 0:
+                # MLP, no layer overlap
+                # assuming half the nodes are now FIFOs, use half the # of
+                # nodes as # inputs to drive the simulation
+                n_inputs = int(len(model.graph.node) / 2)
             else:
-                ncycles = ncycles - 1
+                # convnet, two inputs are typically enough to fill entire
+                # layer pipeline due to overlaps
+                n_inputs = 2
+            sim = verilator_fifosim(model, n_inputs)

-        if not output_detected:
-            warnings.warn(
-                "No output detected, calculated FIFO depths may not be correct"
-            )
+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]

         # Apply depths back into the model;
-        # also set in/outFIFODepth to zero for non-FIFO
+        # also set in/outFIFODepths to zero for non-FIFO
         # nodes, preventing further FIFO insertion
         for node in model.graph.node:
             # set FIFO depth, reset FIFO implementation,
@@ -364,8 +402,14 @@ def apply(self, model):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
+                # exception for top-level IO FIFOs which cause a bug in simulation
+                # (top-level IOs should not have impl_style=vivado)
+                toplevel_in = node.input[0] in [x.name for x in model.graph.input]
+                toplevel_out = node.output[0] in [x.name for x in model.graph.output]
+                toplevel_style_exception = toplevel_in or toplevel_out
                 # Set FIFO implementation/ram styles
-                if depth > self.max_qsrl_depth:
+                if (depth > self.max_qsrl_depth) and (not toplevel_style_exception):
                     node_inst.set_nodeattr("impl_style", "vivado")
                     node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
                 else:
@@ -374,11 +418,10 @@ def apply(self, model):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
-                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                # (removed setting of node FIFO size attributes to 0 here)
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                    if node.name in modified_fc_nodes:
                        node_inst = getCustomOp(node)
                        node_inst.set_nodeattr("mem_mode", "external")
@@ -391,10 +434,174 @@ def apply(self, model):

         # handle custom sizing for SWG FIFOs if desired
         if self.swg_exception:
-            model = model.transform(
-                CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth)
-            )
+            model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))

         # remove shallow FIFOs
         model = model.transform(RemoveShallowFIFOs())

+        # reflect final values in attributes
+        for node in model.graph.node:
+            if node.op_type != "StreamingFIFO":
+                node_inst = getCustomOp(node)
+                fifodepth_in = []
+                for node_inp in node.input:
+                    prod = model.find_producer(node_inp)
+                    if prod is None:
+                        # no producer for this input
+                        if node_inp in [x.name for x in model.graph.input]:
+                            # top-level input with no FIFO
+                            fifodepth_in.append(0)
+                        else:
+                            # FIFO depth attr applies only to dynamic inputs
+                            pass
+                    else:
+                        # there is a producer for this input
+                        if prod.op_type == "StreamingFIFO":
+                            prod_inst = getCustomOp(prod)
+                            fifodepth_in.append(prod_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic input
+                            fifodepth_in.append(0)
+                fifodepth_out = []
+                for node_out in node.output:
+                    cons = model.find_consumer(node_out)
+                    if cons is None:
+                        # no consumer for this output
+                        if node_out in [x.name for x in model.graph.output]:
+                            # top-level output with no FIFO
+                            fifodepth_out.append(0)
+                        else:
+                            # FIFO depth attr applies only to dynamic outputs
+                            pass
+                    else:
+                        # there is a consumer for this output
+                        if cons.op_type == "StreamingFIFO":
+                            cons_inst = getCustomOp(cons)
+                            fifodepth_out.append(cons_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic output
+                            fifodepth_out.append(0)
+                node_inst.set_nodeattr("inFIFODepths", fifodepth_in)
+                node_inst.set_nodeattr("outFIFODepths", fifodepth_out)
+
         return (model, False)
+
+
+def get_fifo_split_configs(depth, max_qsrl_depth=256, max_vivado_depth=32768):
+    """Break non-power-of-2 sized FIFO depths into several ones"""
+
+    def floor_pow2(x):
+        if (x & (x - 1) == 0) and x != 0:
+            return x
+        else:
+            return 1 << ((x - 1).bit_length() - 1)
+
+    def decompose_pow2(x):
+        if x <= max_qsrl_depth:
+            return [x]
+        else:
+            r = floor_pow2(x)
+            if x == r:
+                return [x]
+            else:
+                return [r, *decompose_pow2(x - r)]
+
+    ret = []
+    # trivial case: for small FIFOs, return as-is with rtl style
+    if depth <= max_qsrl_depth:
+        return [(depth, "rtl")]
+    # first pass: ensure max depth is respected
+    # (restricted by Vivado AXIS infra IP)
+    remainder = depth
+    while remainder != 0:
+        if remainder > max_vivado_depth:
+            ret.append(max_vivado_depth)
+            remainder -= max_vivado_depth
+        else:
+            ret.append(remainder)
+            remainder = 0
+    # second pass: break non-power-of-2 sized FIFOs
+    # into several ones
+
+    ret_pass2 = list(map(decompose_pow2, ret))
+    # unpack list of lists
+    ret_pass2 = [x for dec_list in ret_pass2 for x in dec_list]
+
+    # finally, add impl_style to each split FIFO
+    ret_final = []
+    for cand_depth in ret_pass2:
+        if cand_depth <= max_qsrl_depth:
+            ret_final.append((cand_depth, "rtl"))
+        else:
+            ret_final.append((cand_depth, "vivado"))
+
+    return ret_final
+
+
+class SplitLargeFIFOs(Transformation):
+    """Split large FIFOs before implementation, for two reasons:
+
+    - impl_style="vivado" supports a max depth of 32k. Any larger
+      FIFOs must be implemented as a sequence of smaller FIFOs.
+ - impl_style="vivado" requires power-of-two depths, which is + normally handled by rounding up to the nearest power-of-two. + So a FIFO of size 8196 normally gets rounded-up to a depth of + 16384 and wastes a lot of resources. Here, instead, we split + this up into two FIFOs of depth 8192 + 4. + + """ + + def __init__(self, max_qsrl_depth=256, max_vivado_depth=32768): + super().__init__() + self.max_qsrl_depth = max_qsrl_depth + self.max_vivado_depth = max_vivado_depth + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "StreamingFIFO": + n_inst = getCustomOp(node) + depth = n_inst.get_nodeattr("depth") + cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) + if len(cfgs) > 1: + fld_shape = n_inst.get_folded_output_shape() + dtype = n_inst.get_nodeattr("dataType") + ram_style = n_inst.get_nodeattr("ram_style") + shape = model.get_tensor_shape(node.input[0]) + for i, (fifo_depth, impl_style) in enumerate(cfgs): + if i == 0: + inp = node.input[0] + else: + inp = node.name + "_" + str(i - 1) + "_out" + if i == len(cfgs) - 1: + outp = node.output[0] + else: + outp = node.name + "_" + str(i) + "_out" + out_tensor = helper.make_tensor_value_info( + outp, TensorProto.FLOAT, shape + ) + graph.value_info.append(out_tensor) + model.set_tensor_datatype(out_tensor.name, DataType[dtype]) + fifo_node = helper.make_node( + "StreamingFIFO", + [inp], + [outp], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=dtype, + impl_style=impl_style, + ram_style=ram_style, + name=node.name + "_" + str(i), + ) + graph.node.insert(node_ind + i, fifo_node) + + graph.node.remove(node) + graph_modified = True + if graph_modified: + model = model.transform(SortGraph()) + model = model.transform(GiveReadableTensorNames()) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 23943084ab..eca1053f8f 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -62,22 +62,25 @@ class SetFolding(Transformation): Notable exceptions and special behavior: - * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), + When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), which have two attributes (PE and SIMD): - * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max - (configurable in the SetFolding initializer, defaults to 36) - * then increases PE until the target is met or max PE reached - * When folding depthwise convolutions ("VVAU"/VectorVectorActivation) + * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max + (configurable in the SetFolding initializer, defaults to 36) + * then increases PE until the target is met or max PE reached + + When folding depthwise convolutions ("VVAU"/VectorVectorActivation) or spatial reduction ops (Pool_Batch): - * the producer of the node is expected to be a ConvolutionInputGenerator - with depthwise=1, whose SIMD value will be set equal to the PE value of - its consumer node + + * the producer of the node is expected to be a ConvolutionInputGenerator + with depthwise=1, whose SIMD value will be set equal to the PE value of + its consumer node + * the VVAU also supports SIMD ("input window") parallelism next to + PE ("channels"), but 
current ConvInpGen limitations require PE to be fully + unfolded before SIMD is increased """ - def __init__( - self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True - ): + def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True): super().__init__() self.target_cycles_per_frame = target_cycles_per_frame self.mvau_wwidth_max = mvau_wwidth_max @@ -103,12 +106,15 @@ def apply(self, model): "Thresholding_Batch", ] # these ops use SIMD parallelism, up to a max value of NumChannels - # ConvolutionInputGenerator has a special case when depthwise=1 + # ConvolutionInputGenerator* has a special case when depthwise=1 + # ConvolutionInputGenerator_rtl supports additional parallelism by + # setting parallel_window=1 mode after maxing out SIMD simd_ops = [ "DownSampler", "FMPadding_Batch", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", + "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring @@ -134,8 +140,7 @@ def apply(self, model): # finish if target met break if ( - node_inst.get_weight_datatype().bitwidth() - * node_inst.get_nodeattr("SIMD") + node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr("SIMD") > self.mvau_wwidth_max ): # revert if we've gone above width threshold @@ -150,15 +155,36 @@ def apply(self, model): max_pe = node_inst.get_nodeattr("Labels") self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: + # init/reset SIMD of VVAU + if op_type == "VectorVectorActivation": + node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") + # increase SIMD for VVAU once PE is exhausted + pe = node_inst.get_nodeattr("PE") + cyc = node_inst.get_exp_cycles() + if ( + op_type == "VectorVectorActivation" + and pe == max_pe + and cyc > self.target_cycles_per_frame + ): + max_simd = np.prod(node_inst.get_nodeattr("Kernel")) + self.optimize_attribute_val(node_inst, max_simd, "SIMD") # also set the folding of the upsteam DW SWU # which must be identical to this node swu_node = model.find_producer(node.input[0]) if swu_node.op_type.startswith("ConvolutionInputGenerator"): swu_node_inst = getCustomOp(swu_node) - pe = node_inst.get_nodeattr("PE") swu_node_inst.set_nodeattr("SIMD", pe) + # enable parallel_window mode of RTL SWG if needed + if swu_node.op_type == "ConvolutionInputGenerator_rtl": + if ( + op_type == "VectorVectorActivation" + and node_inst.get_nodeattr("SIMD") > 1 + ): + swu_node_inst.set_nodeattr("parallel_window", 1) + else: + swu_node_inst.set_nodeattr("parallel_window", 0) else: if op_type == "VectorVectorActivation": ksize = np.prod(node_inst.get_nodeattr("Kernel")) @@ -167,18 +193,25 @@ def apply(self, model): else: raise Exception("Undefined edge case for %s" % op_type) if ksize != 1: # pointwise vvau/pool lack a SWU - raise Exception( - "Expected SWU on DW op input, found " + swu_node.op_type - ) + raise Exception("Expected SWU on DW op input, found " + swu_node.op_type) elif op_type in simd_ops: - if op_type in [ - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", - ]: + if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") + # init/reset parallel_window mode of RTL SWG + if op_type == "ConvolutionInputGenerator_rtl": + node_inst.set_nodeattr("parallel_window", 0) 
self.optimize_attribute_val(node_inst, max_simd, "SIMD") + # enable parallel_window mode of RTL SWG if needed + simd = node_inst.get_nodeattr("SIMD") + cyc = node_inst.get_exp_cycles() + if ( + op_type == "ConvolutionInputGenerator_rtl" + and simd == max_simd + and cyc > self.target_cycles_per_frame + ): + node_inst.set_nodeattr("parallel_window", 1) else: # depthwise SWGs are handled separately continue @@ -186,9 +219,7 @@ def apply(self, model): max_simd = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_simd, "SIMD") else: - warnings.warn( - "SetFolding doesn't know how to handle op_type " + op_type - ) + warnings.warn("SetFolding doesn't know how to handle op_type " + op_type) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index 05ee6ad920..158825191e 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -135,5 +135,5 @@ file.close() print("Results written to nw_metrics.txt") else: - raise Exception("Exec mode has to be set to remote_pynq or throughput_test") + raise Exception("Exec mode has to be set to execute or throughput_test") """ diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 757b1382c3..8bdfe40224 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -129,13 +129,17 @@ } elseif {$BOARD == "Pynq-Z1"} { set ZYNQ_TYPE "zynq_7000" set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project] +} elseif {$BOARD == "KV260_SOM"} { + set ZYNQ_TYPE "zynq_us+" + set_property board_part xilinx.com:kv260_som:part0:1.3 [current_project] } else { puts "Unrecognized board" } create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { - create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ps + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps] @@ -144,7 +148,8 @@ set_property -dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { - create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 zynq_ps + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] @@ -153,8 +158,10 @@ } #instantiate axi interconnect, axi smartconnect -create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0 -create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0 +set 
interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 #set number of axilite interfaces, and number of axi master interfaces set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0] @@ -242,22 +249,6 @@ close_project """ -alveo_run_sh_template = """#!/bin/bash - -if [ "$#" -ne 2 ]; then - echo "Usage: alveo_run.sh " - exit -1 -fi - -cd $REMOTE_DEPLOY_DIR$ -eval "$(conda shell.bash hook)" -conda activate $CONDA_ENV_NAME$ -source $REMOTE_XRT$/setup.sh -export PLATFORM_REPO_PATHS=$REMOTE_PLATFORM_REPO_PATHS$ -python3.6 driver.py --exec_mode=$1 --batchsize=$2 --bitfile=$BITFILE$ \ - --inputfile=input.npy --outputfile=output.npy --platform=alveo -""" - vitis_gen_xml_report_tcl_template = """ open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr open_run impl_1 diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 855b30fe95..2fc0b2f3bb 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -56,9 +56,7 @@ def _check_vitis_envvars(): assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis" - assert ( - "PLATFORM_REPO_PATHS" in os.environ - ), "PLATFORM_REPO_PATHS must be set for Vitis" + assert "PLATFORM_REPO_PATHS" in os.environ, "PLATFORM_REPO_PATHS must be set for Vitis" assert ( "XILINX_XRT" in os.environ ), "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced" @@ -97,9 +95,7 @@ def apply(self, model): # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface # developed from instructions in UG1393 (v2019.2) and package_xo documentation # package_xo is responsible for generating the kernel xml - assert ( - len(interfaces["axilite"]) <= 1 - ), "CreateVitisXO supports max 1 AXI lite interface" + assert len(interfaces["axilite"]) <= 1, "CreateVitisXO supports max 1 AXI lite interface" axilite_intf_name = None if len(interfaces["axilite"]) == 1: axilite_intf_name = interfaces["axilite"][0] @@ -114,14 +110,12 @@ def apply(self, model): ) arg_id += 1 args_string.append( - "{numReps:0:%s:%s:0x4:0x1C:uint:0}" - % (str(arg_id), axilite_intf_name) + "{numReps:0:%s:%s:0x4:0x1C:uint:0}" % (str(arg_id), axilite_intf_name) ) arg_id += 1 else: args_string.append( - "{numReps:0:%s:%s:0x4:0x10:uint:0}" - % (str(arg_id), axilite_intf_name) + "{numReps:0:%s:%s:0x4:0x10:uint:0}" % (str(arg_id), axilite_intf_name) ) arg_id += 1 for intf in interfaces["s_axis"] + interfaces["m_axis"]: @@ -139,9 +133,10 @@ def apply(self, model): model.set_metadata_prop("vitis_xo", xo_path) # generate the package_xo command in a tcl script - package_xo_string = ( - "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" - % (xo_path, self.ip_name, stitched_ip_dir) + package_xo_string = "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" % ( + xo_path, + self.ip_name, + stitched_ip_dir, ) for arg in args_string: package_xo_string += " -kernel_xml_args " + arg @@ -255,9 +250,7 @@ def apply(self, model): mem_type = "DDR" mem_idx = 1 node_mem_port = "%s[%d]" % (mem_type, mem_idx) - config.append( - "sp=%s.m_axi_gmem0:%s" % 
(instance_names[node.name], node_mem_port) - ) + config.append("sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port)) # connect streams if producer is not None: for i in range(len(node.input)): @@ -281,14 +274,10 @@ def apply(self, model): # add Vivado physopt directives if desired if self.strategy == VitisOptStrategy.PERFORMANCE_BEST: config.append("[vivado]") - config.append( - "prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithRemap" - ) + config.append("prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithRemap") config.append("prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=Explore") config.append("prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=true") - config.append( - "prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore" - ) + config.append("prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore") config.append("prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore") config = "\n".join(config) + "\n" @@ -341,9 +330,7 @@ def apply(self, model): with open(gen_rep_xml_sh, "w") as f: f.write("#!/bin/bash \n") f.write("cd {}\n".format(link_dir)) - f.write( - "vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl") - ) + f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] process_genxml = subprocess.Popen(bash_command, stdout=subprocess.PIPE) @@ -358,16 +345,16 @@ class VitisBuild(Transformation): """Best-effort attempt at building the accelerator with Vitis. It assumes the model has only fpgadataflow nodes - fpga_part: string identifying the target FPGA - period_ns: target clock period - platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] - strategy: Vitis optimization strategy - enable_debug: add Chipscope to all AXI interfaces - floorplan_file: path to a JSON containing a dictionary with SLR assignments - for each node in the ONNX graph. Must be parse-able by - the ApplyConfig transform. - enable_link: enable linking kernels (.xo files), otherwise just synthesize - them independently. + :parameter fpga_part: string identifying the target FPGA + :parameter period_ns: target clock period + :parameter platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] + :parameter strategy: Vitis optimization strategy + :parameter enable_debug: add Chipscope to all AXI interfaces + :parameter floorplan_file: path to a JSON containing a dictionary with + SLR assignments for each node in the ONNX graph. + Must be parse-able by the ApplyConfig transform. + :parameter enable_link: enable linking kernels (.xo files), + otherwise just synthesize them independently. 
""" def __init__( @@ -411,25 +398,20 @@ def apply(self, model): # Build each kernel individually sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition") for sdp_node in sdp_nodes: + prefix = sdp_node.name + "_" sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(RemoveUnusedTensors()) - kernel_model = kernel_model.transform(GiveUniqueNodeNames()) + kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) - kernel_model = kernel_model.transform( - PrepareIP(self.fpga_part, self.period_ns) - ) + kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) kernel_model = kernel_model.transform(HLSSynthIP()) kernel_model = kernel_model.transform( - CreateStitchedIP( - self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True - ) - ) - kernel_model = kernel_model.transform( - CreateVitisXO(sdp_node.onnx_node.name) + CreateStitchedIP(self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True) ) + kernel_model = kernel_model.transform(CreateVitisXO(sdp_node.onnx_node.name)) kernel_model.set_metadata_prop("platform", "alveo") kernel_model.save(dataflow_model_filename) # Assemble design from kernels diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index cec04a182b..ed553e7cee 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -54,9 +54,7 @@ def apply(self, model): fc_inst = getCustomOp(consumer) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") - (b, h, w, c) = model.get_tensor_shape( - transp_node.input[0] - ) + (b, h, w, c) = model.get_tensor_shape(transp_node.input[0]) # absorb transpose into weight matrix, # allowing FC layer to operate on the NHWC input W = model.get_initializer(consumer.input[1]) @@ -78,8 +76,6 @@ def apply(self, model): into subsequent node" ) else: - warnings.warn( - "Unsupported transpose node before flatten layer" - ) + warnings.warn("Unsupported transpose node before flatten layer") return (model, graph_modified) diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py index 967a127636..c921b3d472 100644 --- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py +++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py @@ -56,19 +56,17 @@ class ConvertQONNXtoFINN(Transformation): is not converted to a MultiThreshold node. :param filter_function: Each candidate Quant and BinaryQant node is first evaluated - by this function. If the function returns False, - then the node is not converted to a MultiTrheshold node. - The function is given the model and candidate node as parameters. - Per default a filter function is inserted, which disables the conversion of - Quant nodes, which have a bit width of larger than 8. - Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) + by this function. If the function returns False, + then the node is not converted to a MultiTrheshold node. + The function is given the model and candidate node as parameters. + Per default a filter function is inserted, which disables the conversion of + Quant nodes, which have a bit width of larger than 8. 
+ Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( self, - filter_function=default_filter_function_generator( - max_multithreshold_bit_width=8 - ), + filter_function=default_filter_function_generator(max_multithreshold_bit_width=8), ): super().__init__() self._filter_function = filter_function diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index 80b6042d03..e027010271 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -57,13 +57,9 @@ def apply(self, model): is_const_shape = (n.op_type == "Shape") and (ishape is not None) if is_all_constant_inputs or is_const_shape: # Check node validity - if ( - n.op_type == "Quant" - and not model.get_initializer(n.input[2]) == 0 - ): + if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0: raise ValueError( - "Only Quant nodes with zero-point == 0 " - "are currently supported." + "Only Quant nodes with zero-point == 0 " "are currently supported." ) if model.is_fork_node(n): raise ValueError( @@ -73,8 +69,7 @@ def apply(self, model): target_node = model.find_direct_successors(n) if target_node is None: raise RuntimeError( - "Weights quantized with the Quant node must have " - "a successor node." + "Weights quantized with the Quant node must have " "a successor node." ) else: target_node = target_node[0] @@ -126,10 +121,18 @@ def apply(self, model): model.set_tensor_datatype(node_out, new_dtype) # Reshape scale for Conv if required + target_output_shape = model.get_tensor_shape(target_node.output[0]) if target_node.op_type == "Conv" and len(scale.shape) > 0: - bias_shape = [1] * len(scale.shape) - bias_shape[1] = -1 - scale = scale.reshape(bias_shape) + conv_out_shape = [1] * len(target_output_shape) + # only support per-output channel scaling + # (i.e. all scale shape elems besides 0th must be 1s) + if len(scale.shape) > 1: + assert ( + np.prod(scale.shape[1:]) == 1 + ), "Can't fold scale beyond per-out-channel granularity" + # collect all scaling in channels dim (since we constrain) + conv_out_shape[1] = -1 + scale = scale.reshape(conv_out_shape) if scale.shape == (1,): scale = scale[0] @@ -150,9 +153,7 @@ def apply(self, model): "Can only constant fold scaled Quant weights " "if a successor exists." 
) - assert ( - len(successor) == 1 - ), "Only implemented for a single consumer" + assert len(successor) == 1, "Only implemented for a single consumer" successor = successor[0] succ_output_name = successor.output[0] diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py index 5a3f176f1f..52eb55355a 100644 --- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py +++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py @@ -46,7 +46,7 @@ def _get_signed_from_upstream(model, trunc_node): # Check if the input of this node already has a FINN datatype signed = None inp_dt = model.get_tensor_datatype(node.input[0]) - if inp_dt is not None and inp_dt is not DataType["FLOAT32"]: + if inp_dt is not None and inp_dt != DataType["FLOAT32"]: signed = inp_dt.signed() # Go further up the graph, since the datatype inference works top down # these nodes should either be sign preserving ops or they already have a @@ -67,23 +67,23 @@ def _get_signed_from_upstream(model, trunc_node): ) next_node = next_node[0] out_dt = model.get_tensor_datatype(next_node.output[0]) - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break # Special cases where the node has an internal or intrinsic datatype. if next_node.op_type == "MultiThreshold": - mt_inst = getCustomOp(next_node) + mt_inst = getCustomOp(next_node, onnx_opset_version=9) out_dt = DataType[mt_inst.get_nodeattr("out_dtype")] - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break if next_node.op_type == "BipolarQuant": signed = True break if next_node.op_type == "Quant": - q_inst = getCustomOp(next_node) + q_inst = getCustomOp(next_node, onnx_opset_version=9) out_dt = q_inst.get_integer_datatype(model) - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break @@ -124,18 +124,10 @@ def apply(self, model): node_ind += 1 if n.op_type == "AveragePool": mul_node = model.find_direct_successors(n) - if ( - mul_node is not None - and len(mul_node) == 1 - and mul_node[0].op_type == "Mul" - ): + if mul_node is not None and len(mul_node) == 1 and mul_node[0].op_type == "Mul": mul_node = mul_node[0] t_node = model.find_direct_successors(mul_node) - if ( - t_node is not None - and len(t_node) == 1 - and t_node[0].op_type == "Trunc" - ): + if t_node is not None and len(t_node) == 1 and t_node[0].op_type == "Trunc": t_node = t_node[0] running_node_index = node_ind # Check node for compatibility @@ -143,27 +135,16 @@ def apply(self, model): k_s = get_by_name(n.attribute, "kernel_shape") if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1: raise ValueError( - "FINN only supports average pooling with " - "2D square kernels." + "FINN only supports average pooling with " "2D square kernels." ) k_s = k_s.ints[0] pads = get_by_name(n.attribute, "pads") - if ( - pads is None - or len(set(pads.ints)) != 1 - or pads.ints[0] != 0 - ): - raise ValueError( - "FINN dosn't support padding for average pooling." 
-                            )
+                        if pads is None or len(set(pads.ints)) != 1 or pads.ints[0] != 0:
+                            raise ValueError("FINN doesn't support padding for average pooling.")

                         stride = get_by_name(n.attribute, "strides")
-                        if (
-                            stride is None
-                            or len(stride.ints) != 2
-                            or len(set(stride.ints)) != 1
-                        ):
+                        if stride is None or len(stride.ints) != 2 or len(set(stride.ints)) != 1:
                             raise ValueError(
                                 "FINN only supports 2D strides with equal values in "
                                 "each direction."
@@ -172,11 +153,7 @@ def apply(self, model):

                         # Mul node
                         mul_val = model.get_initializer(mul_node.input[1])
-                        if (
-                            mul_val is None
-                            or len(mul_val.shape) != 0
-                            or mul_val != k_s * k_s
-                        ):
+                        if mul_val is None or len(mul_val.shape) != 0 or mul_val != k_s * k_s:
                             raise ValueError(
                                 f"The Mul node after the AveragePool node must have "
                                 f"static initialization at the second input, "
@@ -188,10 +165,10 @@ def apply(self, model):

                         # Trunc node
                         rounding_mode = get_by_name(t_node.attribute, "rounding_mode")
-                        if rounding_mode is None or rounding_mode.s != b"FLOOR":
+                        rounding_mode_s = None if rounding_mode is None else rounding_mode.s
+                        if rounding_mode is None or rounding_mode_s.upper() != b"FLOOR":
                             raise ValueError(
-                                "The Trunc node must have the rounding_mode "
-                                "set to 'FLOOR'."
+                                "The Trunc node must have the rounding_mode " "set to 'FLOOR'."
                             )
                         for inp in t_node.input[1:]:
                             if model.get_initializer(inp) is None:
@@ -207,13 +184,8 @@ def apply(self, model):
                                 f"the Trunc node, it currently is {zero_pt}."
                             )
                         trunc_in_bits = model.get_initializer(t_node.input[3]).flatten()
-                        trunc_out_bits = model.get_initializer(
-                            t_node.input[4]
-                        ).flatten()
-                        if (
-                            len(trunc_in_bits.shape) != 1
-                            or len(trunc_out_bits.shape) != 1
-                        ):
+                        trunc_out_bits = model.get_initializer(t_node.input[4]).flatten()
+                        if len(trunc_in_bits.shape) != 1 or len(trunc_out_bits.shape) != 1:
                             raise ValueError(
                                 f"Finn only supports scalar bit widths "
                                 f"for the Trunc node. The input bit width "
@@ -228,9 +200,7 @@ def apply(self, model):
                             # https://github.com/Xilinx/finn-base/blob/
                             # 7c2603a95e90e4de2575020e575c24eab6a15889/src/finn/custom_op/
                             # general/quantavgpool2d.py#L94
-                            ibits = math.floor(
-                                math.log(2**trunc_in_bits / (k_s * k_s), 2)
-                            )
+                            ibits = math.floor(math.log(2**trunc_in_bits / (k_s * k_s), 2))
                             # Get sign
                             signed = _get_signed_from_upstream(model, t_node)
                             # ToDo: Change this to NHWC,
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index a50a585077..323e391df4 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -52,9 +52,7 @@ def __init__(self, model: ModelWrapper, quant_node, quant_node_index: int):
         self._q_node = quant_node
         self._q_index = quant_node_index

-    @property
     @classmethod
-    @abstractmethod
     def valid_predecessor_op_types(self):
         """Defines which op types the preceding node is allowed to have
         for this type of activation.
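The hunk above drops @property and @abstractmethod so that valid_predecessor_op_types becomes a plain classmethod; the subclass hunks below and the call sites in quant_act_to_multithreshold.py change to match. A minimal standalone sketch of the pattern, with hypothetical classes that are not FINN code: chaining @property with @classmethod only behaves as a class-level property on some Python versions (support was added in 3.9 and removed again in 3.13), so a plain classmethod returning the list is the portable choice.

# Sketch of the refactor, using hypothetical handler classes.
# A class-level property needs @property stacked on @classmethod, which is
# not portable across Python versions; a plain classmethod always works.
class HandlerBase:
    @classmethod
    def valid_predecessor_op_types(cls):
        raise NotImplementedError


class MyReluHandler(HandlerBase):
    @classmethod
    def valid_predecessor_op_types(cls):
        return ["Relu", "Selu"]


# call sites now invoke the method instead of reading a class attribute
assert "Selu" in MyReluHandler.valid_predecessor_op_types()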
@@ -284,25 +282,31 @@ class QuantReluHandler(QuantActBaseHandler):
     """Class for converting a quantized relu operation expressed in the QONNX
     dialect to the FINN ONNX dialect."""

-    valid_predecessor_op_types = [
-        "Relu",
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "Relu",
+            "Selu",
+        ]

     def _check_compatibility(self):
         if self._q_node.op_type == "Quant":
             q_inst = getCustomOp(self._q_node)
             narrow = q_inst.get_nodeattr("narrow")
             signed = q_inst.get_nodeattr("signed")
-            if signed or narrow:
-                raise ValueError(
-                    "FINN only supports unsigned and non-narrow Quant nodes "
-                    "for Relu activations."
-                )
             if not self._model.get_initializer(self._q_node.input[2]) == 0:
                 raise ValueError(
                     "Only Quant nodes with zero-point == 0 "
                     "are currently supported for ReLu activations."
                 )
+            act_node = self._model.find_direct_predecessors(self._q_node)
+            act_node = act_node[0]
+            if act_node.op_type == "Relu":
+                if signed or narrow:
+                    raise ValueError(
+                        "FINN only supports unsigned and non-narrow Quant nodes "
+                        "for Relu activations."
+                    )
         elif self._q_node.op_type == "BipolarQuant":
             return
         else:
@@ -312,7 +316,31 @@ def _calculate_act_bias(self):
         # No bias allowed for Relu activations, see: https://github.com/Xilinx/
         # brevitas/blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/
         # export/onnx/finn/handler/act.py#L48
-        bias = np.array([0.0], dtype=np_default_dtype)
+        act_node = self._model.find_direct_predecessors(self._q_node)
+        act_node = act_node[0]
+        if act_node.op_type == "Relu":
+            bias = np.array([0.0], dtype=np_default_dtype)
+        elif act_node.op_type == "Selu":
+            # Gather parameters
+            q_inst = getCustomOp(self._q_node)
+            if self._q_node.op_type == "Quant":
+                bit_width = self._model.get_initializer(self._q_node.input[3])
+                narrow = q_inst.get_nodeattr("narrow")
+            elif self._q_node.op_type == "BipolarQuant":
+                bit_width = 1.0
+            else:
+                raise RuntimeError("Got an unexpected quantizer node type")
+            # Calculate bias, see: https://github.com/Xilinx/brevitas/blob/
+            # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+            # onnx/finn/handler/act.py#L64
+            if bit_width == 1.0:
+                bias = np.array([-0.5], dtype=np_default_dtype)
+            else:
+                if narrow:
+                    min_non_scaled_val = -(2 ** (bit_width - 1) - 1)
+                else:
+                    min_non_scaled_val = -(2 ** (bit_width - 1))
+                bias = np.array([min_non_scaled_val], dtype=np_default_dtype)
         return bias

     def _calculate_thresholds(self):
@@ -323,27 +351,49 @@ def _calculate_thresholds(self):
             bit_width = 1.0
         else:
             raise RuntimeError("Got an unexpected quantizer node type")
-        quant_scale = self._model.get_initializer(self._q_node.input[1]).astype(
-            np.float32
-        )
-        # q_inst = getCustomOp(self._q_node)
-        # narrow = q_inst.get_nodeattr("narrow")
+        quant_scale = self._model.get_initializer(self._q_node.input[1]).astype(np.float32)
+        act_node = self._model.find_direct_predecessors(self._q_node)
+        act_node = act_node[0]
+        if act_node.op_type == "Relu":
+            # Calculate thresholds, see: https://github.com/Xilinx/brevitas/blob/
+            # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+            # onnx/finn/handler/act.py#L21
+            num_distinct_values = 2**bit_width
+            num_thresholds = int(num_distinct_values - 1)
+            flat_scale = quant_scale.flatten().astype(np.float32)
+            num_scale_channels = flat_scale.shape[0]
+            step = np.abs(flat_scale).astype(np.float32)
+            min_threshold = step / 2
+            thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype)
+            for c in range(num_scale_channels):
+                for t in range(num_thresholds):
+                    thresholds[c][t] = min_threshold[c] + step[c] * t
-        # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/
-        # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
-        # onnx/finn/handler/act.py#L21
-        num_distinct_values = 2**bit_width
-        num_thresholds = int(num_distinct_values - 1)
-        flat_scale = quant_scale.flatten().astype(np.float32)
-        num_scale_channels = flat_scale.shape[0]
-        step = np.abs(flat_scale).astype(np.float32)
-        min_threshold = step / 2
-        thresholds = np.empty(
-            (num_scale_channels, num_thresholds), dtype=np_default_dtype
-        )
-        for c in range(num_scale_channels):
-            for t in range(num_thresholds):
-                thresholds[c][t] = min_threshold[c] + step[c] * t
+        elif act_node.op_type == "Selu":
+            q_inst = getCustomOp(self._q_node)
+            narrow = q_inst.get_nodeattr("narrow")
+            if narrow:
+                num_distinct_values = 2**bit_width - 1
+            else:
+                num_distinct_values = 2**bit_width
+
+            num_thresholds = int(num_distinct_values - 1)
+            flat_scale = quant_scale.flatten().astype(np.float32)
+            num_scale_channels = flat_scale.shape[0]
+            scale = np.abs(flat_scale).astype(np.float32)
+            half_scale = scale / 2
+            # alpha and lambda
+            # from https://pytorch.org/docs/stable/generated/torch.nn.SELU.html
+            alpha = 1.6732632423543772848170429916717
+            selu_scale = 1.0507009873554804934193349852946
+            thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype)
+            for c in range(num_scale_channels):
+                for t in range(num_thresholds):
+                    step = -1.0 + half_scale + scale[c] * t
+                    if step <= 0:
+                        thresholds[c][t] = np.log(step / (alpha * selu_scale) + 1)
+                    else:
+                        thresholds[c][t] = step / selu_scale

         # ToDo: The index 1 needs to be changed to -1 for the channels last format
         num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1]
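The Selu branch above obtains each MultiThreshold threshold by mapping a quantizer output level back through the SELU nonlinearity, which is lambda * x for x > 0 and lambda * alpha * (exp(x) - 1) for x <= 0: non-positive levels are inverted through the exponential segment, positive levels through the linear segment. A small standalone sanity check of that inversion (illustrative only, not part of the patch), using the same constants:

import numpy as np

ALPHA = 1.6732632423543772848170429916717
LAMBDA = 1.0507009873554804934193349852946


def selu(x):
    # forward SELU as defined in the PyTorch docs referenced above
    return np.where(x > 0, LAMBDA * x, LAMBDA * ALPHA * (np.exp(x) - 1))


def selu_inverse(y):
    # mirrors the two threshold branches in _calculate_thresholds;
    # only valid for y > -ALPHA * LAMBDA, the range SELU can produce
    return np.where(y > 0, y / LAMBDA, np.log(y / (ALPHA * LAMBDA) + 1))


levels = np.array([-0.75, -0.25, 0.25, 0.75])  # example output levels
assert np.allclose(selu(selu_inverse(levels)), levels)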
@@ -367,14 +417,13 @@ def _remove_activation_node(self, multi_threshold_node):
         act_node = self._model.find_direct_predecessors(self._q_node)
         if act_node is None:
             raise RuntimeError(
-                "For handling of Relu activations a predecesor to "
-                "the Quant node must exist."
+                "For handling of Relu activations a predecessor to " "the Quant node must exist."
             )
         act_node = act_node[0]
-        if not act_node.op_type == "Relu":
+        if act_node.op_type not in self.valid_predecessor_op_types():
             raise RuntimeError(
-                "The predecesor of the Quant node must be Relu for handling "
-                "of Relu activations."
+                "The predecessor of the Quant node must be Relu or Selu for handling "
+                "of activations."
             )

         # Reroute upstream tensor
@@ -391,15 +440,17 @@ class QuantIdentityHandler(QuantActBaseHandler):
     these are equivalent to quantized identity activations.
     """

-    valid_predecessor_op_types = [
-        "BatchNormalization",
-        "Sub",
-        "Add",
-        "Mul",
-        "Div",
-        "DebugMarker",
-        None,
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "BatchNormalization",
+            "Sub",
+            "Add",
+            "Mul",
+            "Div",
+            "DebugMarker",
+            None,
+        ]

     def _check_compatibility(self):
         # Gather parameters to check
@@ -407,9 +458,7 @@ def _check_compatibility(self):
         q_inst = getCustomOp(self._q_node)
         signed = q_inst.get_nodeattr("signed")
         if not signed:
-            raise ValueError(
-                "FINN only supports signed Quant nodes for identity activations."
-            )
+            raise ValueError("FINN only supports signed Quant nodes for identity activations.")
         if not self._model.get_initializer(self._q_node.input[2]) == 0:
             raise ValueError(
                 "Only Quant nodes with zero-point == 0 "
@@ -478,9 +527,7 @@ def _calculate_thresholds(self):
         num_scale_channels = flat_scale.shape[0]
         step = np.abs(flat_scale)
         half_step = step / 2.0
-        thresholds = np.empty(
-            (num_scale_channels, num_thresholds), dtype=np_default_dtype
-        )
+        thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype)
         # compute the value of the smallest threshold, we'll neg-bias all
         # generated thresholds by this much
         min_threshold = -half_step - step * ((num_thresholds // 2) - 1)
@@ -491,9 +538,7 @@ def _calculate_thresholds(self):
                 thresholds[c][t] = min_threshold[c] + step[c] * t

         # ToDo: The index 1 needs to be changed to -1 for the channels last format
-        num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[
-            1
-        ]
+        num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1]
         final_shape = (num_output_channels, num_thresholds)
         if thresholds.shape != final_shape:
             thresholds = np.broadcast_to(thresholds, final_shape)
@@ -515,9 +560,7 @@ def _calculate_act_scale(self):
         if bit_width != 1:
             scale = quant_scale
         else:
-            assert (
-                quant_scale.flatten().shape[0] == 1
-            ), "Unsupported BIPOLAR per channel scale"
+            assert quant_scale.flatten().shape[0] == 1, "Unsupported BIPOLAR per channel scale"
             assert quant_scale.flatten()[0] == 1.0, "Unsupported BIPOLAR scale != 1"
             scale = quant_scale * 2
         return scale
diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
index c52d69b0f0..1b1aea1bab 100644
--- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
+++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
@@ -30,7 +30,10 @@
 import warnings
 from qonnx.transformation.base import Transformation

-from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler
+from finn.transformation.qonnx.qonnx_activation_handlers import (
+    QuantActBaseHandler,
+    QuantIdentityHandler,
+)


 def default_filter_function_generator(max_multithreshold_bit_width=8):
@@ -66,8 +69,7 @@ def filter_function(model, q_node):


 class ConvertQuantActToMultiThreshold(Transformation):
-    """
-    Converts Quant nodes in the activation path to MultiThreshold nodes.
+    """Converts Quant nodes in the activation path to MultiThreshold nodes.

     The optional keyword argument `filter_function` presents a way to control
     which Quant and BipolarQuant nodes in the activation path
@@ -75,19 +77,17 @@ class ConvertQuantActToMultiThreshold(Transformation):
     is not converted to a MultiThreshold node.

     :param filter_function: Each candidate Quant and BinaryQant node is first evaluated
-            by this function. If the function returns False,
-            then the node is not converted to a MultiTrheshold node.
-            The function is given the model and candidate node as parameters.
-            Per default a filter function is inserted, which disables the conversion of
-            Quant nodes, which have a bit width of larger than 8.
-            Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+        by this function. If the function returns False,
+        then the node is not converted to a MultiThreshold node.
+        The function is given the model and candidate node as parameters.
+        By default a filter function is inserted, which disables the conversion of
+        Quant nodes with a bit width larger than 8.
+ Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( self, - filter_function=default_filter_function_generator( - max_multithreshold_bit_width=8 - ), + filter_function=default_filter_function_generator(max_multithreshold_bit_width=8), ): super().__init__() self._filter_function = filter_function @@ -110,11 +110,6 @@ def apply(self, model): predecessor_op_type = predecessor[0].op_type else: predecessor_op_type = predecessor - if model.is_fork_node(n): - raise ValueError( - "Forking Quant/BipolarQuant nodes are currently " - "not supported by FINN." - ) if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0: raise ValueError( "Only Quant nodes with zero-point == 0 are currently supported." @@ -132,7 +127,7 @@ def apply(self, model): # Check for possible ambiguity in handler selection valid_predecessors = [] for cls in QuantActBaseHandler.__subclasses__(): - valid_predecessors.extend(cls.valid_predecessor_op_types) + valid_predecessors.extend(cls.valid_predecessor_op_types()) if len(valid_predecessors) != len(set(valid_predecessors)): raise RuntimeError( "Two or more activation handlers declare the same " @@ -143,16 +138,15 @@ def apply(self, model): # Try to find a fitting handler for this Quant activation node for handler_cls in QuantActBaseHandler.__subclasses__(): - if predecessor_op_type in handler_cls.valid_predecessor_op_types: + if predecessor_op_type in handler_cls.valid_predecessor_op_types(): handler = handler_cls(model, n, node_ind) break else: - raise ValueError( - f"Quant nodes in the activation path and with predecessor " - f"nodes of type {predecessor_op_type} are currently not " - f"supported by FINN and can not be converted to " - f"MultiThreshold nodes." - ) + # fall back to QuantIdentityHandler here + # it may still not work due to its particular restrictions, + # but better than just erroring out without trying + handler = QuantIdentityHandler(model, n, node_ind) + model = handler.replace_quant_node() graph_modified = True return (model, graph_modified) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 0299c4f4d8..e3e2468bba 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -80,9 +80,7 @@ def apply(self, model): steps = T.shape[-1] new_min = bias new_max = steps + bias - odt = DataType.get_smallest_possible(steps).name.replace( - "UINT", "INT" - ) + odt = DataType.get_smallest_possible(steps).name.replace("UINT", "INT") odt = DataType[odt] assert odt.allowed(new_max) and odt.allowed( new_min @@ -112,11 +110,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if consumer is not None and consumer.op_type == "MultiThreshold": add_weight_name = n.input[1] @@ -153,11 +147,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) assert A is not None, "Initializer for mul weights is not set." 
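The next hunk belongs to absorb.py's FactorOutMulSignMagnitude, which rewrites a signed (scalar or 1D, non-bipolar) Mul parameter into a BIPOLAR sign Mul followed by a magnitude Mul (np.abs(A), as the hunk shows). A short sketch of the invariant that makes this rewrite safe, with illustrative values rather than anything taken from the patch:

import numpy as np

# x * A == (x * sign(A)) * |A| for any x, provided A has no zero entries;
# sign(A) becomes the BIPOLAR parameter and |A| the new mul weight
A = np.array([-0.5, 2.0, -1.25, 3.0])
sign = np.sign(A)
magnitude = np.abs(A)

x = np.random.rand(3, 4)
assert np.allclose(x * A, (x * sign) * magnitude)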
@@ -203,9 +193,7 @@ def apply(self, model): is_scalar = np.prod(A.shape) == 1 actual_ndims = len(tuple(filter(lambda x: x > 1, A.shape))) is_1d = actual_ndims == 1 - is_not_bipolar = ( - model.get_tensor_datatype(mul_weight_name) != DataType["BIPOLAR"] - ) + is_not_bipolar = model.get_tensor_datatype(mul_weight_name) != DataType["BIPOLAR"] is_signed = (A < 0).any() if is_signed and (is_scalar or is_1d) and is_not_bipolar: start_name = n.input[0] @@ -219,9 +207,7 @@ def apply(self, model): model.set_tensor_datatype(sign_mul_param_name, DataType["BIPOLAR"]) # replace original mul weight by magnitudes model.set_initializer(mul_weight_name, np.abs(A)) - new_mul = oh.make_node( - "Mul", [start_name, sign_mul_param_name], [middle_name] - ) + new_mul = oh.make_node("Mul", [start_name, sign_mul_param_name], [middle_name]) n.input[0] = middle_name graph.node.insert(node_ind - 1, new_mul) graph_modified = True @@ -338,13 +324,9 @@ def apply(self, model): mt_cand.output[0] ) # Create a new ValueInfoProto and set the shape - model.set_tensor_shape( - intermediate_tensor_name, intermediate_tensor_shape - ) + model.set_tensor_shape(intermediate_tensor_name, intermediate_tensor_shape) # Set the tensor layout - model.set_tensor_layout( - intermediate_tensor_name, DataLayout.NHWC - ) + model.set_tensor_layout(intermediate_tensor_name, DataLayout.NHWC) # Set the tensor FINN datatype model.set_tensor_datatype( intermediate_tensor_name, intermediate_tensor_finn_dtype @@ -379,8 +361,7 @@ def apply(self, model): for n in graph.node: node_ind += 1 if ( - n.op_type == "Reshape" - and (model.get_initializer(n.input[1]) == [1, -1]).all() + n.op_type == "Reshape" and (model.get_initializer(n.input[1]) == [1, -1]).all() ) or n.op_type == "Flatten": prod = model.find_producer(n.input[0]) if ( @@ -473,7 +454,7 @@ class AbsorbConsecutiveTransposes(Transformation): """Remove (Transpose -> Transpose) patterns when the input and output of the pattern have the same layout.""" - def Are_opposite_permutations(self, perms1, perms2): + def are_opposite_permutations(self, perms1, perms2): if len(perms1) != len(perms2): return False assert 0 <= max(perms2) < len(perms2), "invalid permutation" @@ -488,72 +469,42 @@ def Are_opposite_permutations(self, perms1, perms2): def apply(self, model): graph = model.graph graph_modified = False - for n in graph.node: - if n.op_type == "Transpose": - if model.is_fork_node(n): - next_nodes = model.find_direct_successors(n) - perms1 = list(get_by_name(n.attribute, "perm").ints) - - # check if all nodes after fork are opposite transposes - all_opposite_transposes = True - for next_node in next_nodes: - if next_node is not None and next_node.op_type == "Transpose": - perms2 = list(get_by_name(next_node.attribute, "perm").ints) - if not self.Are_opposite_permutations(perms1, perms2): - all_opposite_transposes = False - break - else: - all_opposite_transposes = False - break - - if not all_opposite_transposes: - continue - - prod = model.find_producer(n.input[0]) - for next_node in next_nodes: - # connect next_node's consumer input to n's producer output - # TODO implement this to allow for forks as producers and - # joins as consumers - cons = model.find_consumer(next_node.output[0]) - cons.input[0] = prod.output[0] - - # remove consumer transpose - graph.node.remove(next_node) - - # remove producer transpose - graph.node.remove(n) - graph_modified = True - - else: - next_node = model.find_consumer(n.output[0]) + for node in graph.node: + if node.op_type == "Transpose": + next_nodes = 
model.find_consumers(node.output[0]) + perms1 = list(get_by_name(node.attribute, "perm").ints) + if len(next_nodes) == 0: + continue + # check if all nodes after fork are opposite transposes + all_opposite_transposes = True + for next_node in next_nodes: if next_node is not None and next_node.op_type == "Transpose": - perms1 = list(get_by_name(n.attribute, "perm").ints) perms2 = list(get_by_name(next_node.attribute, "perm").ints) - if self.Are_opposite_permutations(perms1, perms2): - - # connect next_node's consumer input to n's producer output - # TODO implement this to allow for forks as producers - consumers = model.find_direct_successors(next_node) - prod = model.find_producer(n.input[0]) - if prod is not None: - for cons in consumers: - for cons_in in cons.input: - if cons_in == next_node.output[0]: - prod.output[0] = cons_in - break - else: - # n.input[0] is top-level graph input - # wire consumers directly to that - for cons in consumers: - for i, iname in enumerate(cons.input): - if iname == next_node.output[0]: - cons.input[i] = n.input[0] - - # remove both transposes - graph.node.remove(n) - graph.node.remove(next_node) + if not self.are_opposite_permutations(perms1, perms2): + all_opposite_transposes = False + break + else: + all_opposite_transposes = False + break + if not all_opposite_transposes: + continue + source_tensor = node.input[0] + for next_node in next_nodes: + # connect next_node's consumers' appropriate input to n's input + # TODO how to handle top-level outputs if any? + nextnode_out = next_node.output[0] + assert nextnode_out not in [x.name for x in model.graph.output] + consumers = model.find_consumers(nextnode_out) + for cons in consumers: + for i, iname in enumerate(cons.input): + if iname == nextnode_out: + cons.input[i] = source_tensor + # remove consumer transpose + graph.node.remove(next_node) + # remove producer transpose + graph.node.remove(node) + graph_modified = True - graph_modified = True if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) @@ -586,23 +537,17 @@ def apply(self, model): if sizes is not None: ishape = model.get_tensor_shape(mt_cand.input[0]) ns, cs, hs, ws = sizes / np.asarray(ishape) - model.set_initializer( - mt_cand.input[2], np.asarray([ns, cs, hs, ws]) - ) + model.set_initializer(mt_cand.input[2], np.asarray([ns, cs, hs, ws])) mt_cand.input.remove(mt_cand.input[3]) # scales already specified, transpose indices to NHWC scales = model.get_initializer(mt_cand.input[2]) assert scales is not None ns, cs, hs, ws = scales - model.set_initializer( - mt_cand.input[2], np.asarray([ns, hs, ws, cs]) - ) + model.set_initializer(mt_cand.input[2], np.asarray([ns, hs, ws, cs])) # get rid of first tranpose node mt_cand.input[0] = node.input[0] graph.node.remove(node) - is_last_node = mt_cand.output[0] in [ - x.name for x in model.graph.output - ] + is_last_node = mt_cand.output[0] in [x.name for x in model.graph.output] new_tensor_name = model.make_new_valueinfo_name() if is_last_node: @@ -612,7 +557,6 @@ def apply(self, model): trans_input = mt_cand.output[0] trans_output = new_tensor_name # fix tensor shapes for Resize and Transpose - # n, c, h, w = model.get_tensor_shape(mt_cand.input[0]) n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0]) model.set_tensor_shape(trans_input, (n, hx, wx, c)) model.set_tensor_shape(trans_output, (n, c, hx, wx)) @@ -623,13 +567,13 @@ def apply(self, model): [trans_output], perm=[0, 3, 1, 2], ) - graph.node.insert(node_ind + 1, new_transpose) # rewire nodes final_t_cands 
= model.find_consumers(mt_cand.output[0]) # rewire next nodes' inputs for final_t_cand in final_t_cands: final_t_cand.input[0] = trans_output mt_cand.output[0] = trans_input + graph.node.insert(node_ind + 1, new_transpose) graph_modified = True if graph_modified: model = model.transform(InferDataTypes()) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 9ff8a2173c..2e6aebf093 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -53,11 +53,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -73,9 +69,7 @@ def apply(self, model): A = model.get_initializer(mul_weight_name) B = model.get_initializer(add_weight_name) if (A is None) or (B is None): - warnings.warn( - "Mul or add does not have constant params, skipping" - ) + warnings.warn("Mul or add does not have constant params, skipping") continue start_name = n.input[0] middle_name = n.output[0] @@ -116,11 +110,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -174,11 +164,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -235,11 +221,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -317,11 +299,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -370,11 +348,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -436,11 +410,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -465,9 +435,7 @@ def apply(self, model): maxpool_out_shape = model.get_tensor_shape(maxpool_out_name) # do not support non-2D MaxPool - kernel_shape = 
list( - get_by_name(maxpool_node.attribute, "kernel_shape").ints - ) + kernel_shape = list(get_by_name(maxpool_node.attribute, "kernel_shape").ints) if len(kernel_shape) != 2: continue @@ -553,6 +521,8 @@ def apply(self, model): # Other transform should handle that if prod0 is None or prod1 is None or (prod0 == prod1): continue + if len(prod0.input) < 2 or len(prod1.input) < 2: + continue init0 = model.get_initializer(prod0.input[1]) init1 = model.get_initializer(prod1.input[1]) # if either initializer is None, skip @@ -673,9 +643,7 @@ def apply(self, model): if ceil_mode is not None: ceil_mode = ceil_mode.i else: - ceil_mode = ( - 0 # default to ceil_mode=0 (equivalent to np.floor) - ) + ceil_mode = 0 # default to ceil_mode=0 (equivalent to np.floor) n.op_type = "MaxPoolNHWC" n.domain = "qonnx.custom_op.general" start_name = n.input[0] @@ -700,9 +668,7 @@ def apply(self, model): if ceil_mode is not None: ceil_mode = ceil_mode.i else: - ceil_mode = ( - 0 # default to ceil_mode=0 (equivalent to np.floor) - ) + ceil_mode = 0 # default to ceil_mode=0 (equivalent to np.floor) n.op_type = "MaxPoolNHWC" n.domain = "qonnx.custom_op.general" start_name = producer.input[0] @@ -723,14 +689,85 @@ def apply(self, model): return (model, graph_modified) +class MakeScaleResizeNHWC(Transformation): + """ + Converts the inputs and outputs for all scales Resize and Upsample nodes + from NCHW to NHWC. + """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW: + warnings.warn( + "%s: Input not NCHW. Can't operate transformation on node." % n.name + ) + continue + consumer = model.find_consumer(n.output[0]) + producer = model.find_producer(n.input[0]) + if n.op_type == "Upsample": + scales_ind = 1 + else: + scales_ind = 2 + if producer is not None and producer.op_type == "Transpose": + perms = list(get_by_name(producer.attribute, "perm").ints) + if perms == [0, 3, 1, 2]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = producer.input[0] + mid_name = n.input[0] + end_name = n.output[0] + (b, hi, wi, c) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(end_name) + producer.input[0] = mid_name + producer.output[0] = end_name + n.input[0] = start_name + n.output[0] = mid_name + model.set_tensor_shape(mid_name, (b, ho, wo, c)) + model.set_tensor_shape(end_name, (b, c, ho, wo)) + graph.node.remove(producer) + graph.node.insert(node_ind, producer) + elif consumer is not None and consumer.op_type == "Transpose": + perms = list(get_by_name(consumer.attribute, "perm").ints) + if perms == [0, 2, 3, 1]: + old_value = model.get_initializer(n.input[scales_ind]) + new_value = np.array( + [old_value[idx] for idx in (0, 2, 3, 1)], + dtype=np.dtype("float32"), + ) + model.set_initializer(n.input[scales_ind], new_value) + start_name = n.input[0] + mid_name = consumer.input[0] + end_name = consumer.output[0] + (b, c, hi, wi) = model.get_tensor_shape(start_name) + (b, c, ho, wo) = model.get_tensor_shape(mid_name) + consumer.input[0] = start_name + consumer.output[0] = mid_name + n.input[0] = mid_name + n.output[0] = end_name + model.set_tensor_shape(mid_name, (b, hi, wi, c)) + model.set_tensor_shape(end_name, (b, ho, wo, c)) + graph.node.remove(consumer) + 
graph.node.insert(node_ind - 1, consumer) + return (model, False) + + class MoveOpPastFork(Transformation): """Move node operations past graph forks. Used when a node before a fork can be merged with nodes in the branches """ - def __init__(self, op_name_list): + def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): super().__init__() self.ops_to_move = op_name_list + self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -744,12 +781,12 @@ def apply(self, model): and model.is_fork_node(n) and not model.is_join_node(n) ): - # Restrict this transform to operations with constant parameters # Assuming parameters is in input 1 - op_init_param = model.get_initializer(n.input[1]) - if op_init_param is None: - continue + if len(n.input) > 1: + op_init_param = model.get_initializer(n.input[1]) + else: + op_init_param = None # Check case when branches are empty and go # to the same node @@ -766,16 +803,20 @@ def apply(self, model): for consumer_node in consumers[1:]: # create new node - new_param_name = model.make_new_valueinfo_name() new_output_tensor_name = model.make_new_valueinfo_name() + if op_init_param is None: + new_inp_list = [n.input[0]] + else: + new_param_name = model.make_new_valueinfo_name() + new_inp_list = [n.input[0], new_param_name] + model.set_initializer(new_param_name, op_init_param) + attrs = self.get_attrs_fxn(n) + # TODO use copy of original node instead to get attrs? new_node = oh.make_node( - n.op_type, - [n.input[0], new_param_name], - [new_output_tensor_name], + n.op_type, new_inp_list, [new_output_tensor_name], **attrs ) graph.node.insert(node_ind, new_node) node_ind += 1 - model.set_initializer(new_param_name, op_init_param) # change consumer input tensor graph.node.remove(consumer_node) @@ -784,9 +825,7 @@ def apply(self, model): consumer_node.input[idx] = new_output_tensor_name break else: - raise Exception( - "Consumer should have the current node output as input" - ) + raise Exception("Consumer should have the current node output as input") graph.node.insert(node_ind, consumer_node) @@ -811,6 +850,11 @@ def __init__(self): super().__init__(["Add", "Mul"]) +class MoveTransposePastFork(MoveOpPastFork): + def __init__(self): + super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + + class MoveMaxPoolPastMultiThreshold(Transformation): """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph.""" @@ -832,9 +876,7 @@ def apply(self, model): mt_out = consumer.output[0] mt_odt = model.get_tensor_datatype(mt_out) if mt_odt.signed() and has_padding: - warnings.warn( - "Skipping padded MaxPool + signed-output MultiThreshold" - ) + warnings.warn("Skipping padded MaxPool + signed-output MultiThreshold") continue # check for non-decreasing thresholds and nonnegative # scale factor in MultiThreshold @@ -945,11 +987,7 @@ def apply(self, model): node_ind = 0 for n in graph.node: node_ind += 1 - if ( - n.op_type == "Flatten" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Flatten" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -1035,11 +1073,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Transpose" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Transpose" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if 
( consumer is not None diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 601dab04cb..5ba5ee0ff5 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -57,8 +57,7 @@ def apply(self, model): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() - or (Tnew > (idtype.max() + 1)).any() + (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() ): # clip any large thresholds to input range + 1 Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 60f2446f59..6ec43cd587 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -31,6 +31,9 @@ import sys import tempfile +# test boards +test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"] + # mapping from PYNQ board names to FPGA part names pynq_part_map = dict() pynq_part_map["Ultra96"] = "xczu3eg-sbva484-1-e" @@ -41,6 +44,8 @@ pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" pynq_part_map["ZCU111"] = "xczu28dr-ffvg1517-2-e" pynq_part_map["RFSoC2x2"] = "xczu28dr-ffvg1517-2-e" +pynq_part_map["KV260_SOM"] = "xck26-sfvc784-2LV-c" + # native AXI HP port width (in bits) for PYNQ boards pynq_native_port_width = dict() @@ -52,6 +57,7 @@ pynq_native_port_width["ZCU104"] = 128 pynq_native_port_width["ZCU111"] = 128 pynq_native_port_width["RFSoC2x2"] = 128 +pynq_native_port_width["KV260_SOM"] = 128 # Alveo device and platform mappings alveo_part_map = dict() @@ -61,10 +67,10 @@ alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e" alveo_default_platform = dict() -alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3" -alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2" -alveo_default_platform["U250"] = "xilinx_u250_gen3x16_xdma_2_1_202010_1" -alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3" +alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_5_202210_1" +alveo_default_platform["U200"] = "xilinx_u200_gen3x16_xdma_2_202110_1" +alveo_default_platform["U250"] = "xilinx_u250_gen3x16_xdma_4_1_202210_1" +alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1" def get_rtlsim_trace_depth(): diff --git a/src/finn/util/create.py b/src/finn/util/create.py index a8c2e67b38..af92d1cb8e 100644 --- a/src/finn/util/create.py +++ b/src/finn/util/create.py @@ -30,7 +30,11 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from qonnx.util.basic import ( + calculate_signed_dot_prod_range, + gen_finn_dt_tensor, + qonnx_make_model, +) def hls_random_mlp_maker(layer_spec): @@ -84,7 +88,7 @@ def hls_mlp_maker(layer_spec): graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[]) - model = helper.make_model(graph, producer_name="finn") + model = qonnx_make_model(graph, producer_name="finn") model = ModelWrapper(model) for lyr in layer_spec: @@ -104,15 +108,11 @@ def hls_mlp_maker(layer_spec): odt = lyr["odt"] if i == 0: - global_in = helper.make_tensor_value_info( - current_in_name, TensorProto.FLOAT, [1, mw] - ) + global_in = helper.make_tensor_value_info(current_in_name, TensorProto.FLOAT, [1, mw]) model.graph.input.append(global_in) if i == len(layer_spec) - 1: - global_out = helper.make_tensor_value_info( - current_out_name, 
TensorProto.FLOAT, [1, mh] - ) + global_out = helper.make_tensor_value_info(current_out_name, TensorProto.FLOAT, [1, mh]) model.graph.output.append(global_out) # there are two ways to implement bipolar weights and inputs for diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 65478d2540..7698850029 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -149,9 +149,7 @@ def pack_innermost_dim_as_hex_string( ndarray = np.asarray(ndarray, dtype=np.float32) def fun(x): - return array2hexstring( - x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix - ) + return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix) return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray) @@ -220,7 +218,7 @@ def unpack_innermost_dim_from_hex_string( if conv_dtype == DataType["BIPOLAR"]: ar_list = [2 * x - 1 for x in ar_list] # interpret values as signed values - elif conv_dtype.name.startswith("INT"): + elif conv_dtype.signed() and conv_dtype.is_integer(): mask = 2 ** (conv_dtype.bitwidth() - 1) ar_list = [-(x & mask) + (x & ~mask) for x in ar_list] @@ -232,9 +230,7 @@ def unpack_innermost_dim_from_hex_string( return array -def numpy_to_hls_code( - ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False -): +def numpy_to_hls_code(ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False): """Return C++ code representation of a numpy ndarray with FINN DataType dtype, using hls_var_name as the resulting C++ variable name. If pack_innermost_dim is specified, the innermost dimension of the ndarray @@ -265,7 +261,7 @@ def numpy_to_hls_code( # define a function to convert a single element into a C++ init string # a single element can be a hex string if we are using packing def elem2str(x): - if type(x) == str or type(x) == np.str_ or type(x) == np.str: + if type(x) == str or type(x) == np.str_: return '%s("%s", 16)' % (hls_dtype, x) elif type(x) == np.float32: if dtype.is_integer(): @@ -311,9 +307,7 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru return packed_data -def rtlsim_output_to_npy( - output, path, dtype, shape, packedBits, targetBits, reverse_inner=True -): +def rtlsim_output_to_npy(output, path, dtype, shape, packedBits, targetBits, reverse_inner=True): """Convert a flattened sequence of Python arbitrary-precision integers output into a NumPy array, saved as npy file at path. 
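The widened signedness check above (conv_dtype.signed() and conv_dtype.is_integer() instead of matching type names that start with "INT") feeds the same two's-complement fix-up as before; as a standalone sketch, with the helper name to_signed invented purely for illustration:

def to_signed(x, bitwidth):
    # subtract the weight of the sign bit, keep the remaining magnitude bits
    mask = 2 ** (bitwidth - 1)
    return -(x & mask) + (x & ~mask)

assert to_signed(0b1111, 4) == -1  # 0xF reinterpreted as INT4
assert to_signed(0b0111, 4) == 7   # positive values pass through unchanged
assert to_signed(0b1000, 4) == -8  # most negative INT4 value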
Each arbitrary-precision integer is assumed to be a packed array of targetBits-bit elements, which @@ -418,9 +412,7 @@ def packed_bytearray_to_finnpy( """ - if ( - not issubclass(type(packed_bytearray), np.ndarray) - ) or packed_bytearray.dtype != np.uint8: + if (not issubclass(type(packed_bytearray), np.ndarray)) or packed_bytearray.dtype != np.uint8: raise Exception("packed_bytearray_to_finnpy needs NumPy uint8 arrays") if packed_bytearray.ndim == 0: raise Exception("packed_bytearray_to_finnpy expects at least 1D ndarray") @@ -446,9 +438,7 @@ def packed_bytearray_to_finnpy( if reverse_endian: packed_bytearray = np.flip(packed_bytearray, axis=-1) # convert innermost dim of byte array to hex strings - packed_hexstring = np.apply_along_axis( - npbytearray2hexstring, packed_dim, packed_bytearray - ) + packed_hexstring = np.apply_along_axis(npbytearray2hexstring, packed_dim, packed_bytearray) ret = unpack_innermost_dim_from_hex_string( packed_hexstring, dtype, output_shape, packed_bits, reverse_inner ) diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py deleted file mode 100644 index d525437300..0000000000 --- a/src/finn/util/gdrive.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
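To make the byte-to-hex step in packed_bytearray_to_finnpy above concrete, a minimal sketch; npbytearray2hexstring_sketch is a simplified stand-in for the real npbytearray2hexstring helper, not the library function itself:

import numpy as np

def npbytearray2hexstring_sketch(arr, prefix="0x"):
    # join the bytes of one innermost-dim slice into a single hex literal
    return prefix + "".join("%02x" % b for b in arr)

packed = np.asarray([[0xDE, 0xAD], [0xBE, 0xEF]], dtype=np.uint8)
hexed = np.apply_along_axis(npbytearray2hexstring_sketch, -1, packed)
# hexed now holds ["0xdead", "0xbeef"], ready for unpack_innermost_dim_from_hex_string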
- -import gspread -import os -import warnings -from datetime import datetime - -from finn.util.basic import get_finn_root - - -def upload_to_end2end_dashboard(data_dict): - gdrive_key = get_finn_root() + "/gdrive-key/service_account.json" - if not os.path.isfile(gdrive_key): - warnings.warn("Google Drive key not found, skipping dashboard upload") - return - gc = gspread.service_account(filename=gdrive_key) - spreadsheet = gc.open("finn-end2end-dashboard") - worksheet = spreadsheet.get_worksheet(0) - keys = list(data_dict.keys()) - vals = list(data_dict.values()) - # check against existing header - existing_keys = worksheet.row_values(1) - if not set(existing_keys).issuperset(set(keys)): - # create new worksheet - dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - worksheet = spreadsheet.add_worksheet( - title="Dashboard " + dtstr, rows=10, cols=len(keys), index=0 - ) - # create header row with keys - worksheet.update("A1:1", [keys]) - # freeze and make header bold - worksheet.freeze(rows=1) - worksheet.format("A1:1", {"textFormat": {"bold": True}}) - # insert values into new row at appropriate positions - worksheet.insert_row([], index=2) - for i in range(len(keys)): - colind = existing_keys.index(keys[i]) - col_letter = chr(ord("A") + colind) - worksheet.update("%s2" % col_letter, vals[i]) diff --git a/src/finn/util/imagenet.py b/src/finn/util/imagenet.py index b4548bb352..1d63adf58b 100644 --- a/src/finn/util/imagenet.py +++ b/src/finn/util/imagenet.py @@ -137,8 +137,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): class_names = { 0: "tench, Tinca tinca", 1: "goldfish, Carassius auratus", - 2: "great white shark, white shark, man-eater, man-eating shark, " - "Carcharodon carcharias", + 2: "great white shark, white shark, man-eater, man-eating shark, " "Carcharodon carcharias", 3: "tiger shark, Galeocerdo cuvieri", 4: "hammerhead, hammerhead shark", 5: "electric ray, crampfish, numbfish, torpedo", @@ -184,8 +183,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 45: "Gila monster, Heloderma suspectum", 46: "green lizard, Lacerta viridis", 47: "African chameleon, Chamaeleo chamaeleon", - 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, " - "Varanus komodoensis", + 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, " "Varanus komodoensis", 49: "African crocodile, Nile crocodile, Crocodylus niloticus", 50: "American alligator, Alligator mississipiensis", 51: "triceratops", @@ -286,8 +284,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 144: "pelican", 145: "king penguin, Aptenodytes patagonica", 146: "albatross, mollymawk", - 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, " - "Eschrichtius robustus", + 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, " "Eschrichtius robustus", 148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", 149: "dugong, Dugong dugon", 150: "sea lion", @@ -580,8 +577,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 433: "bathing cap, swimming cap", 434: "bath towel", 435: "bathtub, bathing tub, bath, tub", - 436: "beach wagon, station wagon, wagon, estate car, beach waggon, " - "station waggon, waggon", + 436: "beach wagon, station wagon, wagon, estate car, beach waggon, " "station waggon, waggon", 437: "beacon, lighthouse, beacon light, pharos", 438: "beaker", 439: "bearskin, busby, shako", @@ -636,8 +632,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, 
verbose=True, k=5): 487: "cellular telephone, cellular phone, cellphone, cell, mobile phone", 488: "chain", 489: "chainlink fence", - 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, " - "ring armour", + 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, " "ring armour", 491: "chain saw, chainsaw", 492: "chest", 493: "chiffonier, commode", diff --git a/src/finn/util/platforms.py b/src/finn/util/platforms.py index 8212cb5712..77dc591445 100644 --- a/src/finn/util/platforms.py +++ b/src/finn/util/platforms.py @@ -104,9 +104,7 @@ def compute_resources(self): def guide_resources(self): guide = [] # TODO: assert limits is of correct size - guide_res = ( - np.tile(np.array(self.compute_resources), (self.ndevices, 1)) - ).astype(int) + guide_res = (np.tile(np.array(self.compute_resources), (self.ndevices, 1))).astype(int) for i in range(self.nslr * self.ndevices): # when in multi-FPGA mode, subtract cost of UDP connection from eth_slr local_slr = i % self.nslr @@ -159,9 +157,7 @@ def compute_connection_cost(self): xlocal[i][j] = 1 # tile connection cost matrices for entire system for i in range(self.ndevices): - x[ - i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr - ] = xlocal + x[i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr] = xlocal # set cost for ethernet connections, assuming daisy-chaining for i in range(self.ndevices - 1): x[i * self.nslr + self.eth_slr][(i + 1) * self.nslr + self.eth_slr] = 10 @@ -182,9 +178,7 @@ def compute_connection_resource(self): slllocal[i][j] = self.sll_count[i][j] # tile connection cost matrices for entire system for i in range(self.ndevices): - sll[ - i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr - ] = slllocal + sll[i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr] = slllocal # set cost for ethernet connections, assuming daisy-chaining eth = np.full((self.nslr * self.ndevices, self.nslr * self.ndevices), 0) # no Eth throughput constraints from one SLR to itself diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index f6a51da8e4..318ba7045e 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -26,32 +26,41 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np import os +import shutil from pyverilator import PyVerilator +from qonnx.custom_op.registry import getCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import ( + get_rtlsim_trace_depth, + launch_process_helper, + make_build_dir, +) -def pyverilate_stitched_ip( - model, - read_internal_signals=True, - disable_common_warnings=True, - extra_verilator_args=[], -): - """Given a model with stitched IP, return a PyVerilator sim object. - Trace depth is also controllable, see get_rtlsim_trace_depth() +def make_single_source_file(filtered_verilog_files, target_file): + """Dump all Verilog code used by stitched IP into a single file. + This is because large models with many files require a verilator + command line too long for bash on most systems""" - :param read_internal_signals If set, it will be possible to examine the - internal (not only port) signals of the Verilog module, but this may - slow down compilation and emulation. 
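The reflowed slice assignments in platforms.py above implement block-diagonal tiling: each device contributes one nslr-by-nslr block of local SLR-to-SLR costs on the diagonal of the system-level matrix. A self-contained sketch with dummy cost values:

import numpy as np

nslr, ndevices = 3, 2
xlocal = np.ones((nslr, nslr), dtype=int)  # per-device SLR crossing costs (dummy)
x = np.zeros((nslr * ndevices, nslr * ndevices), dtype=int)
for i in range(ndevices):
    x[i * nslr : (i + 1) * nslr, i * nslr : (i + 1) * nslr] = xlocal
# off-diagonal blocks stay zero until inter-device Ethernet links are added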
+ # concatenate all verilog code into a single file + with open(target_file, "w") as wf: + for vfile in filtered_verilog_files: + with open(vfile) as rf: + wf.write("//Added from " + vfile + "\n\n") + lines = rf.read() + for line in lines.split("\n"): + # break down too-long lines, Verilator complains otherwise + if len(line) > 20000: + line = line.replace("&", "\n&") + wf.write("\n" + line) - :param disable_common_warnings If set, disable the set of warnings that - Vivado-HLS-generated Verilog typically triggers in Verilator - (which can be very verbose otherwise) - """ - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") +def prepare_stitched_ip_for_verilator(model): + """Prepare sources from given stitched IP for verilator simulation, including + generating a single source file and replacing certain Vivado infrastructure + headers with Verilator-compatible ones""" vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: @@ -64,8 +73,6 @@ def file_to_basename(x): return os.path.basename(os.path.realpath(x)) top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) - top_module_name = top_module_file_name.strip(".v") - build_dir = make_build_dir("pyverilator_ipstitched_") # dump all Verilog code to a single file # this is because large models with many files require @@ -74,10 +81,27 @@ def file_to_basename(x): # are identical but in multiple directories (regslice_core.v) # remove duplicates from list by doing list -> set -> list + src_exts = [".v", ".sv"] + all_verilog_files = list( - set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs)) + set(filter(lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs)) ) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + os.makedirs(verilog_header_dir, exist_ok=True) + + # use custom version of axis infrastructure vh + # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) + custom_vh = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh" + shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh") + for fn in all_verilog_srcs: + if fn.endswith(".vh"): + if "axis_infrastructure_v1_1_0.vh" in fn: + # skip, we use a custom version for this file without recursive gcd + continue + else: + shutil.copy(fn, verilog_header_dir) + # remove all but one instances of regslice_core.v filtered_verilog_files = [] remove_entry = False @@ -86,15 +110,178 @@ def file_to_basename(x): if not remove_entry: filtered_verilog_files.append(vfile) remove_entry = True + elif "swg_pkg" in vfile: + continue else: filtered_verilog_files.append(vfile) - # concatenate all verilog code into a single file - with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf: - for vfile in filtered_verilog_files: - with open(vfile) as rf: - wf.write("//Added from " + vfile + "\n\n") - wf.write(rf.read()) + target_file = vivado_stitch_proj_dir + "/" + top_module_file_name + make_single_source_file(filtered_verilog_files, target_file) + + return vivado_stitch_proj_dir + + +def verilator_fifosim(model, n_inputs, max_iters=100000000): + """Create a Verilator model of stitched IP and use a simple C++ + driver to drive the input stream. 
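The length guard in make_single_source_file above exists because Verilator complains about extremely long source lines, which netlist concatenation can produce; breaking at "&" is safe since Verilog expressions may continue across newlines. A standalone demonstration of the rewrite:

line = " & ".join("sig_%d" % i for i in range(4000))  # one ~40000-char expression
if len(line) > 20000:
    line = line.replace("&", "\n&")  # every "&" now begins a continuation line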
Useful for FIFO sizing, latency + and throughput measurement.""" + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + build_dir = make_build_dir("verilator_fifosim_") + fifosim_cpp_fname = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/cpp/verilator_fifosim.cpp" + with open(fifosim_cpp_fname, "r") as f: + fifosim_cpp_template = f.read() + assert len(model.graph.input) == 1, "Only a single input stream is supported" + assert len(model.graph.output) == 1, "Only a single output stream is supported" + iname = model.graph.input[0].name + first_node = model.find_consumer(iname) + oname = model.graph.output[0].name + last_node = model.find_producer(oname) + assert (first_node is not None) and (last_node is not None), "Failed to find first/last nodes" + fnode_inst = getCustomOp(first_node) + lnode_inst = getCustomOp(last_node) + ishape_folded = fnode_inst.get_folded_input_shape() + oshape_folded = lnode_inst.get_folded_output_shape() + + fifo_log = [] + fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' + fifo_log_templ += "<< to_string(top->maxcount%s) << endl;" + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_ind = 0 + for fifo_node in fifo_nodes: + fifo_node = getCustomOp(fifo_node) + if fifo_node.get_nodeattr("depth_monitor") == 1: + suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind + fifo_log.append(fifo_log_templ % (suffix, suffix)) + fifo_ind += 1 + fifo_log = "\n".join(fifo_log) + + template_dict = { + "ITERS_PER_INPUT": np.prod(ishape_folded[:-1]), + "ITERS_PER_OUTPUT": np.prod(oshape_folded[:-1]), + "N_INPUTS": n_inputs, + "MAX_ITERS": max_iters, + "FIFO_DEPTH_LOGGING": fifo_log, + } + + for key, val in template_dict.items(): + fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val)) + + with open(build_dir + "/verilator_fifosim.cpp", "w") as f: + f.write(fifosim_cpp_template) + + which_verilator = shutil.which("verilator") + if which_verilator is None: + raise Exception("'verilator' executable not found") + + # add defines to make certain XPM src files work with Verilator + xpm_args = [] + xpm_args.append("-DDISABLE_XPM_ASSERTIONS") + xpm_args.append("-DOBSOLETE") + xpm_args.append("-DONESPIN") + xpm_args.append("--bbox-unsup") + vivado_path = os.environ["VIVADO_PATH"] + # additional SystemVerilog modules to make XPMs work with Verilator + xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" + xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" + xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" + swg_pkg = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv" + verilog_file_arg = [swg_pkg, "finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo] + + verilator_args = [ + "perl", + which_verilator, + "-Wno-fatal", + "-Mdir", + build_dir, + "-y", + vivado_stitch_proj_dir, + "-y", + verilog_header_dir, + "--CFLAGS", + "--std=c++11", + "-O3", + "--x-assign", + "fast", + "--x-initial", + "fast", + "--noassert", + "--cc", + *verilog_file_arg, + "--top-module", + "finn_design_wrapper", + "--exe", + "verilator_fifosim.cpp", + "--threads", + "4", + *xpm_args, + ] + + proc_env = os.environ.copy() + gcc_args = "-O3 -march=native" + proc_env["OPT_FAST"] = gcc_args + make_args = [ + "make", + "-j4", + "-C", + build_dir, + "-f", + "Vfinn_design_wrapper.mk", + "Vfinn_design_wrapper", + ] + + with open(build_dir + "/compile.sh", "w") as f: + f.write("#!/bin/bash" + "\n") + f.write("export OPT_FAST='%s'\n" % gcc_args) + f.write(" 
".join(verilator_args) + "\n") + f.write(" ".join(make_args) + "\n") + + launch_process_helper(verilator_args, cwd=build_dir) + launch_process_helper(make_args, proc_env=proc_env, cwd=build_dir) + + sim_launch_args = ["./Vfinn_design_wrapper"] + launch_process_helper(sim_launch_args, cwd=build_dir) + + with open(build_dir + "/results.txt", "r") as f: + results = f.read().strip().split("\n") + ret_dict = {} + for result_line in results: + key, val = result_line.split("\t") + ret_dict[key] = int(val) + return ret_dict + + +def pyverilate_stitched_ip( + model, + read_internal_signals=True, + disable_common_warnings=True, + extra_verilator_args=[], +): + """Given a model with stitched IP, return a PyVerilator sim object. + Trace depth is also controllable, see get_rtlsim_trace_depth() + + :param read_internal_signals If set, it will be possible to examine the + internal (not only port) signals of the Verilog module, but this may + slow down compilation and emulation. + + :param disable_common_warnings If set, disable the set of warnings that + Vivado-HLS-generated Verilog typically triggers in Verilator + (which can be very verbose otherwise) + + """ + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + build_dir = make_build_dir("pyverilator_ipstitched_") verilator_args = [] # disable common verilator warnings that should be harmless but commonly occur @@ -108,10 +295,22 @@ def file_to_basename(x): # force inlining of all submodules to ensure we can read internal signals properly if read_internal_signals: verilator_args += ["--inline-mult", "0"] + # add defines to make certain XPM src files work with Verilator + verilator_args.append("-DDISABLE_XPM_ASSERTIONS") + verilator_args.append("-DOBSOLETE") + verilator_args.append("-DONESPIN") + verilator_args.append("--bbox-unsup") + vivado_path = os.environ["VIVADO_PATH"] + # additional SystemVerilog modules to make XPMs work with Verilator + xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" + xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" + xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" + + swg_pkg = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv" sim = PyVerilator.build( - top_module_file_name, - verilog_path=[vivado_stitch_proj_dir], + [swg_pkg, top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc], + verilog_path=[vivado_stitch_proj_dir, verilog_header_dir], build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), top_module_name=top_module_name, diff --git a/src/finn/util/test.py b/src/finn/util/test.py index f5d3b1c30b..2115e058a8 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -26,10 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pkg_resources as pk - import pytest +import importlib_resources as importlib import numpy as np import onnx import onnx.numpy_helper as nph @@ -91,8 +90,8 @@ def soft_verify_topk(invec, idxvec, k): """Check that the topK indices provided actually point to the topK largest values in the input vector""" np_topk = np.flip(invec.flatten().argsort())[:k] - soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()] - soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()] + soft_expected = invec.flatten()[np_topk.astype(np.int_).flatten()] + soft_produced = invec.flatten()[idxvec.astype(np.int_).flatten()] return (soft_expected == soft_produced).all() @@ -106,37 +105,26 @@ def load_test_checkpoint_or_skip(filename): pytest.skip(filename + " not found from previous test step, skipping") -def get_build_env(kind, target_clk_ns): +def get_build_env(board, target_clk_ns): """Get board-related build environment for testing. - - kind = either zynq or alveo. + - board = any from pynq_part_map or alveo_part_map """ ret = {} - if kind == "zynq": - ret["board"] = os.getenv("PYNQ_BOARD", default="Pynq-Z1") - ret["part"] = pynq_part_map[ret["board"]] - ret["ip"] = os.getenv("PYNQ_IP", "") - ret["username"] = os.getenv("PYNQ_USERNAME", "xilinx") - ret["password"] = os.getenv("PYNQ_PASSWORD", "xilinx") - ret["port"] = os.getenv("PYNQ_PORT", 22) - ret["target_dir"] = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") - ret["build_fxn"] = ZynqBuild(ret["board"], target_clk_ns) - elif kind == "alveo": - ret["board"] = os.getenv("ALVEO_BOARD", default="U250") - ret["part"] = alveo_part_map[ret["board"]] - ret["platform"] = alveo_default_platform[ret["board"]] - ret["ip"] = os.getenv("ALVEO_IP", "") - ret["username"] = os.getenv("ALVEO_USERNAME", "") - ret["password"] = os.getenv("ALVEO_PASSWORD", "") - ret["port"] = os.getenv("ALVEO_PORT", 22) - ret["target_dir"] = os.getenv("ALVEO_TARGET_DIR", "/tmp/finn_alveo_deploy") + if board in pynq_part_map: + ret["kind"] = "zynq" + ret["part"] = pynq_part_map[board] + ret["build_fxn"] = ZynqBuild(board, target_clk_ns) + elif board in alveo_part_map: + ret["kind"] = "alveo" + ret["part"] = alveo_part_map[board] ret["build_fxn"] = VitisBuild( ret["part"], target_clk_ns, - ret["platform"], + alveo_default_platform[board], strategy=VitisOptStrategy.BUILD_SPEED, ) else: - raise Exception("Unknown test build environment spec") + raise Exception("Unknown board specified") return ret @@ -148,10 +136,9 @@ def get_example_input(topology): onnx_tensor = onnx.load_tensor_from_string(raw_i) return nph.to_array(onnx_tensor) elif topology == "cnv": - fn = pk.resource_filename( - "finn.qnn-data", "cifar10/cifar10-test-data-class3.npz" - ) - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) return input_tensor else: raise Exception("Unknown topology, can't return example input") @@ -180,6 +167,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) sdp_node.set_nodeattr("model", child_path) + sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx else 0) ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True) if return_full_ctx: return ret diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index aaeb3ab920..69dd82c5ea 
100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -69,7 +69,7 @@ def get_fifo_count_max(vcd_file, fifo_count_signal): assert len(d) != 0, "FIFO count signal not found" events = list(d.values())[0]["tv"] max = 0 - for (time, val) in events: + for time, val in events: current = int(val, base=2) if current > max: max = current @@ -101,19 +101,21 @@ def get_stream_if_stats(vcd_file, if_base_name): <state>: (<count>, <fraction>), where <state> is the combination of (V)alid/(R)eady values, - <count> is the approximate number of rising clock edges spent in - <state>, and <fraction> is the fraction of <count> to total + <count> is the approximate number of rising clock edges spent in <state>, + and <fraction> is the fraction of <count> to total amount of time recorded by the trace. Example: - {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061), - "{'V': 1, 'R': 0}": (0, 0.0), - "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), - "{'V': 1, 'R': 1}": (640, 0.07757575757575758)} - + { + "{'V': 0, 'R': 0}": (5, 0.0006060606060606061), + "{'V': 1, 'R': 0}": (0, 0.0), + "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), + "{'V': 1, 'R': 1}": (640, 0.07757575757575758) + } Here we can see the stream was transmitting values 7.7% of the time, and 92.2% of the time there was no incoming data (valid 0, ready 1) """ + if_valid = if_base_name + vname if_ready = if_base_name + rname v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True) @@ -138,7 +140,7 @@ def get_stream_if_stats(vcd_file, if_base_name): status = {"V": 0, "R": 0} last_time = 0 total_rising_clock_edges = 0 - for (sig, time, val) in events: + for sig, time, val in events: # pyverilator generates 5 time units per sample time = time / 5 # pyverilator generates 4 samples per clock period diff --git a/tests/brevitas/king_charles.jpg b/tests/brevitas/king_charles.jpg index c1400a484e..d3639a69e9 100755 Binary files a/tests/brevitas/king_charles.jpg and b/tests/brevitas/king_charles.jpg differ diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py index 669601ecb6..053b632221 100644 --- a/tests/brevitas/test_brevitas_avg_pool_export.py +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -30,10 +30,8 @@ import numpy as np import os import torch -from brevitas.export import FINNManager -from brevitas.export.onnx.generic.manager import BrevitasONNXManager -from brevitas.nn import QuantAvgPool2d -from brevitas.quant_tensor import QuantTensor +from brevitas.export import export_qonnx +from brevitas.nn import QuantIdentity, QuantReLU, TruncAvgPool2d from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_datatypes import InferDataTypes @@ -48,10 +46,9 @@ @pytest.mark.brevitas_export -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("kernel_size", [2, 3]) @pytest.mark.parametrize("stride", [1, 2]) -@pytest.mark.parametrize("signed", [True, False]) +@pytest.mark.parametrize("signed", [True]) # TODO: Add unsigned test case @pytest.mark.parametrize("bit_width", [2, 4]) @pytest.mark.parametrize("input_bit_width", [4, 8, 16]) @pytest.mark.parametrize("channels", [2, 4]) @@ -64,79 +61,56 @@ def test_brevitas_avg_pool_export( input_bit_width, channels, idim, - QONNX_export, ): - export_onnx_path = base_export_onnx_path.replace( - ".onnx", f"test_QONNX-{QONNX_export}.onnx" - ) - quant_avgpool = QuantAvgPool2d( + export_onnx_path = base_export_onnx_path.replace(".onnx", "test_QONNX.onnx") + if signed: + quant_node = QuantIdentity( + bit_width=input_bit_width, + return_quant_tensor=True, + ) + else: +
quant_node = QuantReLU( + bit_width=input_bit_width, + return_quant_tensor=True, + ) + quant_avgpool = TruncAvgPool2d( kernel_size=kernel_size, stride=stride, bit_width=bit_width, return_quant_tensor=False, + float_to_int_impl_type="FLOOR", ) - quant_avgpool.eval() + model_brevitas = torch.nn.Sequential(quant_node, quant_avgpool) + model_brevitas.eval() # determine input - prefix = "INT" if signed else "UINT" - dt_name = prefix + str(input_bit_width) - dtype = DataType[dt_name] input_shape = (1, channels, idim, idim) - input_array = gen_finn_dt_tensor(dtype, input_shape) - # Brevitas QuantAvgPool layers need QuantTensors to export correctly - # which requires setting up a QuantTensor instance with the scale - # factor, zero point, bitwidth and signedness - scale_array = np.ones((1, channels, 1, 1)).astype(np.float32) - scale_array *= 0.5 - input_tensor = torch.from_numpy(input_array * scale_array).float() - scale_tensor = torch.from_numpy(scale_array).float() - zp = torch.tensor(0.0) - input_quant_tensor = QuantTensor( - input_tensor, scale_tensor, zp, input_bit_width, signed, training=False - ) + input_array = gen_finn_dt_tensor(DataType["FLOAT32"], input_shape) - # export - if QONNX_export: - BrevitasONNXManager.export( - quant_avgpool, - export_path=export_onnx_path, - input_t=input_quant_tensor, - ) - model = ModelWrapper(export_onnx_path) + input_tensor = torch.from_numpy(input_array).float() - # Statically set the additional inputs generated by the BrevitasONNXManager - model.graph.input.remove(model.graph.input[3]) - model.graph.input.remove(model.graph.input[2]) - model.graph.input.remove(model.graph.input[1]) - model.set_initializer("1", scale_array) - model.set_initializer("2", np.array(0.0).astype(np.float32)) - model.set_initializer("3", np.array(input_bit_width).astype(np.float32)) - model.save(export_onnx_path) + # export + export_qonnx( + model_brevitas, + export_path=export_onnx_path, + input_t=input_tensor, + ) + model = ModelWrapper(export_onnx_path) + model.save(export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - FINNManager.export( - quant_avgpool, export_path=export_onnx_path, input_t=input_quant_tensor - ) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) # reference brevitas output - ref_output_array = quant_avgpool(input_quant_tensor).detach().numpy() + ref_output_array = model_brevitas(input_tensor).detach().numpy() # finn output - if QONNX_export: - # Manually apply the Quant tensor scaling for QONNX - idict = {model.graph.input[0].name: input_array * scale_array} - else: - idict = {model.graph.input[0].name: input_array} + idict = {model.graph.input[0].name: input_array} odict = oxe.execute_onnx(model, idict, True) finn_output = odict[model.graph.output[0].name] # compare outputs assert np.isclose(ref_output_array, finn_output).all() # cleanup - # assert False os.remove(export_onnx_path) diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py index 62aab2e3c2..3950a5b6a7 100644 --- a/tests/brevitas/test_brevitas_cnv.py +++ b/tests/brevitas/test_brevitas_cnv.py @@ -26,15 +26,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import os import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs @@ -51,29 +49,24 @@ @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [1, 2]) @pytest.mark.parametrize("wbits", [1, 2]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_cnv_export_exec(wbits, abits, QONNX_export): +def test_brevitas_cnv_export_exec(wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") cnv = get_test_model_trained("CNV", wbits, abits) ishape = (1, 3, 32, 32) - if QONNX_export: - BrevitasONNXManager.export(cnv, ishape, export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - bo.export_finn_onnx(cnv, ishape, export_onnx_path) + export_qonnx(cnv, torch.randn(ishape), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) assert len(model.graph.input) == 1 assert len(model.graph.output) == 1 - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # run using FINN-based execution diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index 181d610fff..d6879a727b 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -34,12 +34,9 @@ import onnx.numpy_helper as nph import os import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import RemoveStaticGraphInputs -from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe @@ -48,41 +45,23 @@ @pytest.mark.brevitas_export -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("QONNX_FINN_conversion", [False, True]) -def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): - if (not QONNX_export) and QONNX_FINN_conversion: - pytest.skip("This test configuration is not valid and is thus skipped.") +def test_brevitas_debug(QONNX_FINN_conversion): finn_onnx = "test_brevitas_debug.onnx" fc = get_test_model_trained("TFC", 2, 2) ishape = (1, 1, 28, 28) - if QONNX_export: - dbg_hook = bo.enable_debug(fc, proxy_level=True) - BrevitasONNXManager.export(fc, ishape, finn_onnx) - # DebugMarkers 
have the brevitas.onnx domain, so that needs adjusting - model = ModelWrapper(finn_onnx) - dbg_nodes = model.get_nodes_by_op_type("DebugMarker") - for dbg_node in dbg_nodes: - dbg_node.domain = "qonnx.custom_op.general" - model.save(finn_onnx) - qonnx_cleanup(finn_onnx, out_file=finn_onnx) - if QONNX_FINN_conversion: - model = ModelWrapper(finn_onnx) - model = model.transform(ConvertQONNXtoFINN()) - model.save(finn_onnx) - else: - dbg_hook = bo.enable_debug(fc) - bo.export_finn_onnx(fc, ishape, finn_onnx) + dbg_hook = bo.enable_debug(fc, proxy_level=True) + export_qonnx(fc, torch.randn(ishape), finn_onnx) + # DebugMarkers have the brevitas.onnx domain, so that needs adjusting + model = ModelWrapper(finn_onnx) + dbg_nodes = model.get_nodes_by_op_type("DebugMarker") + for dbg_node in dbg_nodes: + dbg_node.domain = "qonnx.custom_op.general" + model.save(finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) + if QONNX_FINN_conversion: model = ModelWrapper(finn_onnx) - # DebugMarkers have the brevitas.onnx domain, so that needs adjusting - # ToDo: We should probably have transformation pass, which does this - # domain conversion for us? - dbg_nodes = model.get_nodes_by_op_type("DebugMarker") - for dbg_node in dbg_nodes: - dbg_node.domain = "qonnx.custom_op.general" - model = model.transform(InferShapes()) - model = model.transform(FoldConstants()) - model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(ConvertQONNXtoFINN()) model.save(finn_onnx) model = ModelWrapper(finn_onnx) assert len(model.graph.input) == 1 @@ -106,17 +85,12 @@ def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): names_common = names_brevitas.intersection(names_finn) # The different exports return debug markers in different numbers and places print(len(names_common)) - if QONNX_export and not QONNX_FINN_conversion: + if not QONNX_FINN_conversion: assert len(names_common) == 12 - elif QONNX_export and QONNX_FINN_conversion: - assert len(names_common) == 8 else: - assert len(names_common) == 16 + assert len(names_common) == 8 for dbg_name in names_common: - if QONNX_export: - tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() - else: - tensor_pytorch = dbg_hook.values[dbg_name].detach().numpy() + tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() tensor_finn = output_dict[dbg_name] assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all() os.remove(finn_onnx) diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py index 211fdb629b..842d099f57 100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -28,12 +28,11 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants @@ -56,26 +55,19 @@ @pytest.mark.parametrize("wbits", [1, 2]) # network topology / size @pytest.mark.parametrize("size", ["TFC", "SFC", "LFC"]) -# QONNX export -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits, QONNX_export): +def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if size == "LFC" and wbits == 2 and abits == 2: pytest.skip("No LFC-w2a2 present at the moment") if wbits > abits: pytest.skip("No wbits > abits cases at the 
moment") - nname = "%s_%dW%dA_QONNX-%d" % (size, wbits, abits, QONNX_export) + nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) - if QONNX_export: - BrevitasONNXManager.export(fc, ishape, finn_onnx) - qonnx_cleanup(finn_onnx, out_file=finn_onnx) - model = ModelWrapper(finn_onnx) - model = model.transform(ConvertQONNXtoFINN()) - model.save(finn_onnx) - else: - bo.export_finn_onnx(fc, ishape, finn_onnx) + export_qonnx(fc, torch.randn(ishape), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) diff --git a/tests/brevitas/test_brevitas_mobilenet.py b/tests/brevitas/test_brevitas_mobilenet.py index b1475b6f4e..be200f6cd4 100644 --- a/tests/brevitas/test_brevitas_mobilenet.py +++ b/tests/brevitas/test_brevitas_mobilenet.py @@ -28,9 +28,9 @@ import pytest -import brevitas.onnx as bo import numpy as np import torch +from brevitas.export import export_qonnx from PIL import Image from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -45,9 +45,11 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import get_finn_root, make_build_dir from finn.util.pytorch import NormalizePreProc from finn.util.test import crop_center, get_test_model_trained, resize_smaller_side @@ -76,12 +78,12 @@ def test_brevitas_mobilenet(): std = 0.226 ch = 3 preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) # set input finn datatype to UINT8 - preproc_model.set_tensor_datatype( - preproc_model.graph.input[0].name, DataType["UINT8"] - ) + preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"]) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) preproc_model = preproc_model.transform(GiveUniqueParameterTensors()) @@ -89,7 +91,8 @@ def test_brevitas_mobilenet(): finn_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_exported.onnx" mobilenet = get_test_model_trained("mobilenet", 4, 4) - bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) # do forward pass in PyTorch/Brevitas input_tensor = preproc.forward(img_torch) @@ -100,7 +103,9 @@ def test_brevitas_mobilenet(): expected_top5_prob = [] for index in expected_top5: expected_top5_prob.append(expected_topk[index]) + model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(InsertTopK()) @@ -121,4 +126,4 @@ def 
test_brevitas_mobilenet(): produced = odict[model.graph.output[0].name] produced_prob = odict["TopK_0_out0"] * a0 assert (produced.flatten() == expected_top5).all() - assert np.isclose(produced_prob.flatten(), expected_top5_prob).all() + assert np.isclose(produced_prob.flatten(), expected_top5_prob, atol=2.2 * 1e-1).all() diff --git a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py index 5d70acb102..08a193714a 100644 --- a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py +++ b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx # noqa import os @@ -36,7 +35,7 @@ from brevitas.core.quant import QuantType from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantHardTanh from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes @@ -52,10 +51,7 @@ @pytest.mark.parametrize("abits", [1, 2, 4, 8]) @pytest.mark.parametrize("narrow_range", [False, True]) @pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7)]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_qhardtanh_nonscaled( - abits, narrow_range, max_val, QONNX_export -): +def test_brevitas_act_export_qhardtanh_nonscaled(abits, narrow_range, max_val): def get_quant_type(bit_width): if bit_width is None: return QuantType.FP @@ -76,20 +72,13 @@ def get_quant_type(bit_width): scaling_impl_type=ScalingImplType.CONST, narrow_range=narrow_range, ) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] diff --git a/tests/brevitas/test_brevitas_qconv2d.py b/tests/brevitas/test_brevitas_qconv2d.py index 214c55e5fd..4b27671891 100644 --- a/tests/brevitas/test_brevitas_qconv2d.py +++ b/tests/brevitas/test_brevitas_qconv2d.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import os import torch @@ -36,7 +35,7 @@ from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType from brevitas.core.stats import StatsOp -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantConv2d from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -54,8 +53,7 @@ @pytest.mark.parametrize("dw", [False, True]) @pytest.mark.parametrize("bias", [True, False]) 
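Reviewer note: every Brevitas test touched by this patch now funnels through the same three-step pipeline instead of branching on QONNX_export: export with export_qonnx, clean up the file, convert to FINN-ONNX. A minimal sketch of that shared pattern, using a hypothetical QuantLinear module and file name purely for illustration:

import torch
from brevitas.export import export_qonnx
from brevitas.nn import QuantLinear
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.util.cleanup import cleanup as qonnx_cleanup

from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

onnx_path = "export_pipeline_sketch.onnx"  # hypothetical file name
module = QuantLinear(3, 4, bias=True, weight_bit_width=4)  # hypothetical module

# 1) trace the Brevitas module with a sample input and export it to QONNX
export_qonnx(module, torch.randn(1, 3), onnx_path)
# 2) tidy the exported graph in place
qonnx_cleanup(onnx_path, out_file=onnx_path)
# 3) lower QONNX Quant nodes into FINN's internal representation
model = ModelWrapper(onnx_path)
model = model.transform(ConvertQONNXtoFINN())
model.save(onnx_path)

All five calls appear verbatim in the hunks above and below; only the module and the path are made up here.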
@pytest.mark.parametrize("in_channels", [32]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_QConv2d(dw, bias, in_channels, QONNX_export): +def test_brevitas_QConv2d(dw, bias, in_channels): ishape = (1, 32, 111, 111) if dw is True: groups = in_channels @@ -94,16 +92,11 @@ def test_brevitas_QConv2d(dw, bias, in_channels, QONNX_export): weight_tensor = gen_finn_dt_tensor(DataType["INT4"], w_shape) b_conv.weight = torch.nn.Parameter(torch.from_numpy(weight_tensor).float()) b_conv.eval() - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_conv, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_conv, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_conv, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} diff --git a/tests/brevitas/test_brevitas_qlinear.py b/tests/brevitas/test_brevitas_qlinear.py index bcd75a5455..a6ea077e7a 100644 --- a/tests/brevitas/test_brevitas_qlinear.py +++ b/tests/brevitas/test_brevitas_qlinear.py @@ -28,12 +28,11 @@ import pytest -import brevitas.onnx as bo import numpy as np import os import torch from brevitas.core.quant import QuantType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantLinear from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -53,10 +52,7 @@ @pytest.mark.parametrize("in_features", [3]) @pytest.mark.parametrize("w_bits", [4]) @pytest.mark.parametrize("i_dtype", [DataType["UINT4"]]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_qlinear( - bias, out_features, in_features, w_bits, i_dtype, QONNX_export -): +def test_brevitas_qlinear(bias, out_features, in_features, w_bits, i_dtype): i_shape = (1, in_features) w_shape = (out_features, in_features) b_linear = QuantLinear( @@ -68,21 +64,14 @@ def test_brevitas_qlinear( weight_quant_type=QuantType.INT, weight_scaling_per_output_channel=True, ) - weight_tensor_fp = np.random.uniform(low=-1.0, high=1.0, size=w_shape).astype( - np.float32 - ) + weight_tensor_fp = np.random.uniform(low=-1.0, high=1.0, size=w_shape).astype(np.float32) b_linear.weight.data = torch.from_numpy(weight_tensor_fp) b_linear.eval() - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_linear, i_shape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_linear, i_shape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_linear, torch.randn(i_shape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) inp_tensor = gen_finn_dt_tensor(i_dtype, i_shape) idict = {model.graph.input[0].name: inp_tensor} diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index b0c3d6088c..2254670202 100644 --- 
a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -28,15 +28,12 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx # noqa import os import torch -from brevitas.core.quant import QuantType -from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantReLU from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes @@ -50,63 +47,27 @@ @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [2, 4, 8]) -@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) -@pytest.mark.parametrize( - "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER] -) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_export): - min_val = -1.0 - ishape = (1, 15) - +@pytest.mark.parametrize("ishape", [(1, 15), (1, 32, 1, 1)]) +def test_brevitas_act_export_relu( + abits, + ishape, +): b_act = QuantReLU( bit_width=abits, - max_val=max_val, - scaling_impl_type=scaling_impl_type, - restrict_scaling_type=RestrictValueType.LOG_FP, - quant_type=QuantType.INT, ) - if scaling_impl_type == ScalingImplType.PARAMETER: - checkpoint = { - "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\ -scaling_impl.learned_value": torch.tensor( - 0.49 - ).type( - torch.FloatTensor - ) - } - b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] inp_tensor = torch.from_numpy(inp_tensor).float() b_act.eval() expected = b_act.forward(inp_tensor).detach().numpy() - if not np.isclose(produced, expected, atol=1e-3).all(): - print(abits, max_val, scaling_impl_type) - print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach()) - if abits < 5: - print( - "thres:", - ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]), - ) - print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) - print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) - print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) @@ -114,68 +75,32 @@ def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_expor @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [2, 4, 8]) -@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) -@pytest.mark.parametrize("scaling_per_channel", [True, False]) 
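Reviewer note: the parametrization being removed in this hunk (the max_val sweep and PARAMETER scaling with a hand-loaded state dict) is replaced by two fixed configurations, a per-tensor QuantReLU and the per-channel variant below. The per-channel construction used by the rewritten test, as a standalone sketch with illustrative values:

import torch
from brevitas.core.scaling import ScalingImplType
from brevitas.nn import QuantReLU

ishape = (1, 32, 1, 1)  # one of the shapes the test sweeps
b_act = QuantReLU(
    bit_width=4,
    max_val=6.0,
    scaling_impl_type=ScalingImplType.CONST,
    scaling_per_output_channel=True,
    per_channel_broadcastable_shape=(1, ishape[1], 1, 1),
)
out = b_act(torch.randn(ishape))  # quantized ReLU with one scale per channel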
-@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_relu_imagenet( - abits, max_val, scaling_per_channel, QONNX_export +@pytest.mark.parametrize("ishape", [(1, 15, 4, 4), (1, 32, 1, 1)]) +def test_brevitas_act_export_relu_channel( + abits, + ishape, ): - out_channels = 32 - ishape = (1, out_channels, 1, 1) - min_val = -1.0 + ch = ishape[1] b_act = QuantReLU( bit_width=abits, - quant_type=QuantType.INT, - scaling_impl_type=ScalingImplType.PARAMETER, - scaling_per_channel=scaling_per_channel, - restrict_scaling_type=RestrictValueType.LOG_FP, - scaling_min_val=2e-16, max_val=6.0, - return_quant_tensor=False, - per_channel_broadcastable_shape=(1, out_channels, 1, 1), + scaling_impl_type=ScalingImplType.CONST, + scaling_per_output_channel=True, + per_channel_broadcastable_shape=(1, ch, 1, 1), ) - if scaling_per_channel is True: - rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) - else: - rand_tensor = torch.tensor(1.2398) - checkpoint = { - "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\ -scaling_impl.learned_value": rand_tensor.type( - torch.FloatTensor - ) - } - b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] inp_tensor = torch.from_numpy(inp_tensor).float() b_act.eval() expected = b_act.forward(inp_tensor).detach().numpy() - if not np.isclose(produced, expected, atol=1e-3).all(): - print(abits, max_val) - print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach()) - if abits < 5: - print( - "thres:", - ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]), - ) - print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) - print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) - print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) diff --git a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py index 403d406105..e7d87faed8 100644 --- a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py +++ b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx # noqa import os @@ -36,7 +35,7 @@ from brevitas.core.quant import QuantType from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantHardTanh from qonnx.core.modelwrapper import ModelWrapper from 
qonnx.transformation.infer_shapes import InferShapes @@ -53,12 +52,9 @@ @pytest.mark.parametrize("narrow_range", [False, True]) @pytest.mark.parametrize("min_val", [-1.0, -(1 - 2 ** (-7)), -2]) @pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7), 2]) -@pytest.mark.parametrize( - "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER] -) -@pytest.mark.parametrize("QONNX_export", [False, True]) +@pytest.mark.parametrize("scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER]) def test_brevitas_act_export_qhardtanh_scaled( - abits, narrow_range, min_val, max_val, scaling_impl_type, QONNX_export + abits, narrow_range, min_val, max_val, scaling_impl_type ): def get_quant_type(bit_width): if bit_width is None: @@ -89,20 +85,13 @@ def get_quant_type(bit_width): ) } b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] diff --git a/tests/brevitas/test_brevitas_selu_act_export.py b/tests/brevitas/test_brevitas_selu_act_export.py new file mode 100644 index 0000000000..c8d040dbee --- /dev/null +++ b/tests/brevitas/test_brevitas_selu_act_export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
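Reviewer note: the new file whose body follows adds coverage for an activation pattern that previously had no export test, a floating-point SELU feeding a quantizer. Stripped of the export and execution scaffolding, the module under test is just (bit width illustrative):

import torch
from brevitas.nn import QuantIdentity

abits = 4  # the test sweeps 2, 4 and 8 bits
b_act = torch.nn.Sequential(torch.nn.SELU(), QuantIdentity(bit_width=abits, narrow=True))
y = b_act(torch.randn(1, 15))  # float SELU, then quantization to abits bits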
+ +import pytest + +import numpy as np +import onnx # noqa +import os +import torch +from brevitas.export import export_qonnx +from brevitas.nn import QuantIdentity +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.util.basic import get_preferred_onnx_opset +from qonnx.util.cleanup import cleanup as qonnx_cleanup + +import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + + +@pytest.mark.brevitas_export +@pytest.mark.parametrize("abits", [2, 4, 8]) +@pytest.mark.parametrize("ishape", [(1, 15), (1, 32, 1, 1)]) +@pytest.mark.parametrize("narrow", [True, False]) +def test_brevitas_act_export_selu(abits, ishape, narrow): + export_path = "test_brevitas_selu_act_export_%s.onnx" % str(abits) + b_act = torch.nn.Sequential(torch.nn.SELU(), QuantIdentity(bit_width=abits, narrow=narrow)) + + export_qonnx( + b_act, + torch.randn(ishape), + export_path, + opset_version=get_preferred_onnx_opset(), + ) + qonnx_cleanup(export_path, out_file=export_path) + model = ModelWrapper(export_path) + model = model.transform(ConvertQONNXtoFINN()) + + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + b_act.eval() + expected = b_act.forward(inp_tensor).detach().numpy() + + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_path) diff --git a/tests/brevitas/test_brevitas_validate_mobilenet.py b/tests/brevitas/test_brevitas_validate_mobilenet.py index 55915838e8..18f8fa9a41 100644 --- a/tests/brevitas/test_brevitas_validate_mobilenet.py +++ b/tests/brevitas/test_brevitas_validate_mobilenet.py @@ -35,6 +35,7 @@ import torch import torchvision.datasets as datasets import torchvision.transforms as transforms +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( @@ -48,10 +49,12 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb import finn.util.imagenet as imagenet_util +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir from finn.util.pytorch import NormalizePreProc from finn.util.test import get_test_model_trained @@ -101,9 +104,6 @@ def test_brevitas_mobilenet_preproc(): @pytest.mark.brevitas_export @pytest.mark.slow -# marked as XFAIL until Brevitas export issues are resolved: -# https://github.com/Xilinx/brevitas/issues/173 -@pytest.mark.xfail def test_brevitas_compare_exported_mobilenet(): if "IMAGENET_VAL_PATH" not in os.environ.keys(): pytest.skip("Can't do validation without IMAGENET_VAL_PATH") @@ -113,8 +113,10 @@ def test_brevitas_compare_exported_mobilenet(): # export preprocessing preproc_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_preproc.onnx" preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = 
ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) preproc_model = preproc_model.transform(GiveUniqueParameterTensors()) @@ -124,8 +126,10 @@ def test_brevitas_compare_exported_mobilenet(): mobilenet = get_test_model_trained("mobilenet", 4, 4) if debug_mode: dbg_hook = bo.enable_debug(mobilenet) - bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) @@ -145,9 +149,7 @@ def test_brevitas_compare_exported_mobilenet(): model = model.transform(MergeONNXModels(preproc_model)) model.save(export_onnx_path + "/quant_mobilenet_v1_4b.onnx") - with open( - export_onnx_path + "/mobilenet_validation.csv", "w", newline="" - ) as csvfile: + with open(export_onnx_path + "/mobilenet_validation.csv", "w", newline="") as csvfile: writer = csv.writer(csvfile) writer.writerow( [ @@ -164,7 +166,7 @@ def test_brevitas_compare_exported_mobilenet(): workload = imagenet_util.get_val_images(n_images, interleave_classes=True) all_inds_ok = True all_probs_ok = True - for (img_path, target_id) in workload: + for img_path, target_id in workload: img_np = imagenet_util.load_resize_crop(img_path) img_torch = torch.from_numpy(img_np).float() # do forward pass in PyTorch/Brevitas diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 103f18b514..8ac2493d1e 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -28,20 +28,18 @@ import pytest -import brevitas.onnx as bo +import itertools import numpy as np # as of Feb'20 there is a bug that segfaults ONNX shape inference if we # import pytorch before onnx, so we make sure to import onnx first import onnx # NOQA import os -import subprocess import torch import warnings -from brevitas.export.onnx.generic.manager import BrevitasONNXManager -from collections import OrderedDict +from brevitas.export import export_qonnx from dataset_loading import cifar, mnist -from datetime import datetime +from distutils.dir_util import copy_tree from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -60,13 +58,13 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.util.cleanup import cleanup as qonnx_cleanup -from scipy.stats import linregress +from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx -from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim +from finn.core.throughput_test import throughput_test_rtlsim from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -76,11 +74,15 @@ from 
finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.move_reshape import RemoveCNVtoFCFlatten @@ -90,8 +92,7 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) -from finn.util.basic import get_finn_root -from finn.util.gdrive import upload_to_end2end_dashboard +from finn.util.basic import get_finn_root, make_build_dir, test_board_map from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -103,39 +104,20 @@ ) build_dir = os.environ["FINN_BUILD_DIR"] -target_clk_ns = 10 +target_clk_ns = 20 mem_mode = "decoupled" rtlsim_trace = False -def get_checkpoint_name(topology, wbits, abits, QONNX_export, step): - return build_dir + "/end2end_%s_w%da%d_QONNX-%d_%s.onnx" % ( +def get_checkpoint_name(topology, wbits, abits, step): + return build_dir + "/end2end_%s_w%da%d_%s.onnx" % ( topology, wbits, abits, - QONNX_export, step, ) -def get_dashboard_data(topology, wbits, abits): - stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits) - stats_dict = OrderedDict() - if os.path.isfile(stats_file): - with open(stats_file, "r") as f: - stats_dict_txt = f.read() - stats_dict = eval(stats_dict_txt) - return stats_dict - - -def update_dashboard_data(topology, wbits, abits, key, val): - stats_dict = get_dashboard_data(topology, wbits, abits) - stats_dict[key] = val - stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits) - with open(stats_file, "w") as f: - f.write(str(stats_dict)) - - def fold_tfc(model): fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") # (PE, SIMD, ramstyle) for each layer @@ -270,7 +252,7 @@ def measure_top1_accuracy(model_chkpt, dataset, parent_chkpt=None): raise Exception("Unrecognized dataset") # move from dataset_loader layout to ONNX layout: NHWC -> NCHW testx = testx.transpose(0, 3, 1, 2) - model = ModelWrapper(model_chkpt) + model = load_test_checkpoint_or_skip(model_chkpt) iname = model.graph.input[0].name oname = model.graph.output[0].name if parent_chkpt is None: @@ -310,42 +292,179 @@ def topology2dataset(topology): raise Exception("Unrecognized topology") -@pytest.mark.parametrize("wbits", [1, 2]) -@pytest.mark.parametrize("abits", [1, 2]) -@pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -@pytest.mark.end2end +def deploy_based_on_board(model, model_title, topology, wbits, abits, board): + # Check if a deployment directory for this board type already exists + if ("FINN_DEPLOY_DIR" in os.environ) and (board in os.environ["FINN_DEPLOY_DIR"]): + deploy_dir_root = os.environ["FINN_DEPLOY_DIR"] + else: + deploy_dir_root = 
make_build_dir(prefix="hw_deployment_" + board + "_") + # Set it for the next round if multiple bitstreams are selected for generation + os.environ["FINN_DEPLOY_DIR"] = deploy_dir_root + + # create directory for deployment files + deployment_dir = deploy_dir_root + "/" + board + "/" + model_title + os.makedirs(deployment_dir) + model.set_metadata_prop("pynq_deployment_dir", deployment_dir) + + # get and copy necessary files + # .bit and .hwh file + bitfile = model.get_metadata_prop("bitfile") + hwh_file = model.get_metadata_prop("hw_handoff") + deploy_files = [bitfile, hwh_file] + + for dfile in deploy_files: + if dfile is not None: + copy(dfile, deployment_dir) + + # create input and output test files + (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( + topology, wbits, abits, return_topk=1 + ) + + # Some changes are required in order to prepare the input tensor data for hardware + # testing. The ONNX graphs for these models contain nodes that manipulate the input + # tensor shape which FINN considers when creating the model. The same input tensor + # shaping needs to be done here on the input data. + # For the convolutional models, the graph contains the Transpose node. The Brevitas + # model works in NCHW layout but the FINN kernels are optimized for NHWC. + # The FC models contain a Reshape node, which FINN uses, so we therefore have to + # reshape the input tensor data to match the reshaping in the model + if topology == "cnv": + input_tensor_npy = input_tensor_npy.transpose(0, 2, 3, 1) + else: + input_shape = input_tensor_npy.shape + new_input_shape = (input_shape[0], np.prod(input_shape[1:])) + input_tensor_npy = input_tensor_npy.reshape(new_input_shape) + + np.save(os.path.join(deployment_dir, "input.npy"), input_tensor_npy.copy()) + np.save(os.path.join(deployment_dir, "output_reference.npy"), output_tensor_npy) + + # driver.py and python libraries + pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir") + copy_tree(pynq_driver_dir, deployment_dir) + model.set_metadata_prop("pynq_deploy_dir", deployment_dir) + + +# parameters that make up inputs to test case(s) +def get_full_parameterized_test_list(marker, wbits_list, abits_list, topology_list, board_list): + test_cases = [ + ( + f"{marker}_w{param1}_a{param2}_{param3}_{param4}", + { + "wbits": param1, + "abits": param2, + "topology": param3, + "board": param4, + }, + ) + for param1, param2, param3, param4 in itertools.product( + wbits_list, + abits_list, + topology_list, + board_list, + ) + ] + return test_cases + + +def pytest_generate_tests(metafunc): + idlist = [] + argvalues = [] + scenarios = [] + + # Full set of test parameters + wbits = [1, 2] + abits = [1, 2] + topology = ["lfc", "tfc", "cnv"] + + # Separate the full list of markers used on command line. 
+ # This allows a user to select multiple markers + all_markers_used = metafunc.config.getoption("-m").split(" ") + + for marker in all_markers_used: + if "sanity_bnn" in marker: + # Define a set of sanity tests that target each of + # the supported boards with fixed parameters + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[1], + abits_list=[1], + topology_list=["lfc"], + board_list=[test_board_map[0]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[1], + abits_list=[2], + topology_list=["cnv"], + board_list=[test_board_map[1]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[2], + abits_list=[2], + topology_list=["tfc"], + board_list=[test_board_map[2]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[2], + abits_list=[2], + topology_list=["cnv"], + board_list=[test_board_map[3]], + ) + ) + + if "bnn_" in marker: + # Target the full set of parameters for a single board + # Extract the board name from the marker used, as it is in the form of 'bnn_' + bnn_board = next( + (element for element in test_board_map if marker.split("_")[1] in element.lower()), + None, + ) + test_cases = get_full_parameterized_test_list( + "bnn", wbits, abits, topology, [bnn_board] + ) + scenarios.extend(test_cases) + + if len(scenarios) > 0: + for scenario in scenarios: + idlist.append(scenario[0]) + items = scenario[1].items() + argnames = [x[0] for x in items] + argvalues.append([x[1] for x in items]) + metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") + + +@pytest.mark.sanity_bnn +@pytest.mark.bnn_pynq +@pytest.mark.bnn_zcu104 +@pytest.mark.bnn_kv260 +@pytest.mark.bnn_u250 class TestEnd2End: - def test_export(self, topology, wbits, abits, QONNX_export): + def test_export(self, topology, wbits, abits, board): if wbits > abits: pytest.skip("No wbits > abits end2end network configs for now") if topology == "lfc" and not (wbits == 1 and abits == 1): pytest.skip("Skipping certain lfc configs") (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) - chkpt_name = get_checkpoint_name(topology, wbits, abits, QONNX_export, "export") - if QONNX_export: - BrevitasONNXManager.export(model, ishape, chkpt_name) - qonnx_cleanup(chkpt_name, out_file=chkpt_name) - model = ModelWrapper(chkpt_name) - model = model.transform(ConvertQONNXtoFINN()) - model.save(chkpt_name) - else: - bo.export_finn_onnx(model, ishape, chkpt_name) - nname = "%s_w%da%d" % (topology, wbits, abits) - update_dashboard_data(topology, wbits, abits, "network", nname) - dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - update_dashboard_data(topology, wbits, abits, "datetime", dtstr) - finn_commit = subprocess.check_output( - ["git", "rev-parse", "HEAD"], cwd=get_finn_root() - ) - finn_commit = finn_commit.decode("utf-8").strip() - update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit) + chkpt_name = get_checkpoint_name(topology, wbits, abits, "export") + export_qonnx(model, torch.randn(ishape), chkpt_name, opset_version=13) + qonnx_cleanup(chkpt_name, out_file=chkpt_name) + model = ModelWrapper(chkpt_name) + model = model.transform(ConvertQONNXtoFINN()) + model.save(chkpt_name) assert os.path.isfile(chkpt_name) - def test_import_and_tidy(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "export" - ) + def test_import_and_tidy(self, topology, wbits, 
abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "export") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) @@ -353,24 +472,22 @@ def test_import_and_tidy(self, topology, wbits, abits, QONNX_export): model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) model = model.transform(RemoveStaticGraphInputs()) - chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "import_and_tidy" - ) + chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy") model.save(chkpt) - def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "import_and_tidy" - ) + def test_add_pre_and_postproc(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy") model = load_test_checkpoint_or_skip(prev_chkpt_name) global_inp_name = model.graph.input[0].name ishape = model.get_tensor_shape(global_inp_name) # preprocessing: torchvision's ToTensor divides uint8 inputs by 255 totensor_pyt = ToTensor() - chkpt_preproc_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "preproc" - ) - bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name) + chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc") + export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name, opset_version=13) + qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name) + pre_model = ModelWrapper(chkpt_preproc_name) + pre_model = pre_model.transform(ConvertQONNXtoFINN()) + pre_model.save(chkpt_preproc_name) assert os.path.isfile(chkpt_preproc_name) # join preprocessing and core model pre_model = ModelWrapper(chkpt_preproc_name) @@ -382,9 +499,7 @@ def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): model.set_tensor_datatype(global_inp_name, DataType["UINT8"]) # postprocessing: insert Top-1 node at the end model = model.transform(InsertTopK(k=1)) - chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) + chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post") # tidy-up again model = model.transform(InferShapes()) model = model.transform(FoldConstants()) @@ -395,10 +510,8 @@ def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): model.save(chkpt_name) assert os.path.isfile(chkpt_name) - def test_streamline(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) + def test_streamline(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(absorb.AbsorbSignBiasIntoMultiThreshold()) # move past any reshapes to be able to streamline input scaling @@ -414,14 +527,10 @@ def test_streamline(self, topology, wbits, abits, QONNX_export): model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) model = model.transform(InferDataLayouts()) model = model.transform(RemoveUnusedTensors()) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "streamline") - ) + model.save(get_checkpoint_name(topology, wbits, abits, "streamline")) - def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, 
abits, QONNX_export, "streamline" - ) + def test_convert_to_hls_layers(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline") model = load_test_checkpoint_or_skip(prev_chkpt_name) if topology == "tfc" and wbits == 1 and abits == 1: # use standalone thresholds for tfc-w1a1 to also exercise that option @@ -443,11 +552,7 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export): model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) - model.save( - get_checkpoint_name( - topology, wbits, abits, QONNX_export, "convert_to_hls_layers" - ) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers")) exp_layer_counts = { "tfc": [ ("Reshape", 1), @@ -481,54 +586,48 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export): else: exp_key = topology exp_layer_counts = exp_layer_counts[exp_key] - for (op_type, exp_count) in exp_layer_counts: + for op_type, exp_count in exp_layer_counts: assert len(model.get_nodes_by_op_type(op_type)) == exp_count - def test_create_dataflow_partition(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "convert_to_hls_layers" - ) + def test_create_dataflow_partition(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) parent_model = model.transform(CreateDataflowPartition()) - parent_model_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") parent_model.save(parent_model_chkpt) sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) - dataflow_model_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_model" - ) + dataflow_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_model") dataflow_model.save(dataflow_model_chkpt) - def test_fold(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_model" - ) + def test_fold(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "dataflow_model") model = load_test_checkpoint_or_skip(prev_chkpt_name) folding_fxn = get_folding_function(topology, wbits, abits) model = folding_fxn(model) - model.save(get_checkpoint_name(topology, wbits, abits, QONNX_export, "fold")) + model.save(get_checkpoint_name(topology, wbits, abits, "fold")) + + def test_minimize_bit_width(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") + model = load_test_checkpoint_or_skip(prev_chkpt_name) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(MinimizeWeightBitWidth()) + curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") + model.save(curr_chkpt_name) @pytest.mark.slow @pytest.mark.vivado - def test_cppsim(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fold" - ) + def 
test_cppsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) - cppsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "cppsim" - ) + cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim") model.save(cppsim_chkpt) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( topology, wbits, abits, return_topk=1 ) @@ -537,55 +636,34 @@ def test_cppsim(self, topology, wbits, abits, QONNX_export): @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_ipgen(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_ipgen(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fold" - ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") model = load_test_checkpoint_or_skip(prev_chkpt_name) - test_fpga_part = get_build_env(kind, target_clk_ns)["part"] model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareIP(build_data["part"], target_clk_ns)) model = model.transform(HLSSynthIP()) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "ipgen_" + kind) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + board)) @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_set_fifo_depths(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipgen_" + kind - ) + def test_set_fifo_depths(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - test_fpga_part = get_build_env(kind, target_clk_ns)["part"] + test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") assert len(fifo_layers) > 0 - hls_layers = model.get_finn_nodes() - for node in hls_layers: - if node.op_type != "StreamingFIFO": - op_inst = getCustomOp(node) - assert op_inst.get_nodeattr("inFIFODepth") == 0 - assert op_inst.get_nodeattr("outFIFODepth") == 0 - model.save( - get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind - ) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind - ) + def test_ipstitch_rtlsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, 
"fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - test_fpga_part = get_build_env(kind, target_clk_ns)["part"] + test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) @@ -597,21 +675,14 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(PrepareRTLSim()) model.set_metadata_prop("exec_mode", "rtlsim") os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1)) if rtlsim_trace: - model.set_metadata_prop( - "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits) - ) + model.set_metadata_prop("rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)) os.environ["RTLSIM_TRACE_DEPTH"] = "3" - rtlsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) model.save(rtlsim_chkpt) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( topology, wbits, abits, return_topk=1 ) @@ -620,11 +691,8 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_throughput_rtlsim(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + def test_throughput_rtlsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) n_nodes = len(model.graph.node) perf_est = model.analysis(dataflow_performance) @@ -635,33 +703,18 @@ def test_throughput_rtlsim(self, topology, wbits, abits, QONNX_export, kind): ret = throughput_test_rtlsim(model, batchsize=batchsize) res_cycles = ret["cycles"] est_cycles = latency + cycles_per_sample_est * batchsize - # warnings.warn("Estimated & rtlsim performance: " + str(perf)) - # for (k, v) in perf.items(): - # update_dashboard_data(topology, wbits, abits, k, v) - update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency) assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15 @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_validate_top1(self, topology, wbits, abits, QONNX_export, kind): + def test_validate_top1(self, topology, wbits, abits, board): if "TEST_END2END_VALIDATE_TOP1" not in os.environ: pytest.skip("TEST_END2END_VALIDATE_TOP1 not set") - prepostproc_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) - streamline_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "streamline" - ) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) - cppsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "cppsim" - ) - rtlsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + prepostproc_chkpt = get_checkpoint_name(topology, 
wbits, abits, "pre_post") + streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline") + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") + cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim") + rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) dataset = topology2dataset(topology) assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80 assert measure_top1_accuracy(streamline_chkpt, dataset) > 80 @@ -671,156 +724,46 @@ def test_validate_top1(self, topology, wbits, abits, QONNX_export, kind): @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_build(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_build(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind - ) + if board == "U250" and wbits == 1 and abits == 1: + if topology == "lfc" or topology == "tfc": + pytest.xfail( + "bnn_w" + + str(wbits) + + "_a" + + str(abits) + + "_" + + topology + + "_" + + board + + " test_build currently disabled, see CR-1171874" + ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - cfg = get_build_env(kind, target_clk_ns) - model = model.transform(cfg["build_fxn"]) + model = model.transform(build_data["build_fxn"]) model = model.transform(AnnotateResources("synth")) - synth_dct = eval(model.get_metadata_prop("res_total_top_synth")) - for (k, v) in synth_dct.items(): - update_dashboard_data(topology, wbits, abits, k, v) - update_dashboard_data(topology, wbits, abits, "board", cfg["board"]) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "build_" + kind) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "build_" + board)) @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_make_pynq_driver(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_make_pynq_driver(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "build_" + kind - ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - kind_to_driver_platform = {"zynq": "zynq-iodma", "alveo": "alveo"} - model = model.transform(MakePYNQDriver(kind_to_driver_platform[kind])) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "driver_" + kind) - ) + board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" + model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model.save(get_checkpoint_name(topology, wbits, abits, "driver_" + board)) - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_deploy(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "driver_" + kind - ) + 
def test_deploy(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "driver_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - model = model.transform( - DeployToPYNQ( - cfg["ip"], - cfg["port"], - cfg["username"], - cfg["password"], - cfg["target_dir"], - ) - ) + model_title = "%s_w%d_a%d_%s" % ("bnn", wbits, abits, topology) + deploy_based_on_board(model, model_title, topology, wbits, abits, board) # save the model to be able to link it to the parent - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "deploy_" + kind) - ) - - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_run_on_hw(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "deploy_" + kind - ) - model = load_test_checkpoint_or_skip(prev_chkpt_name) # NOQA - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( - topology, wbits, abits, return_topk=1 - ) - parent_model = load_test_checkpoint_or_skip( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "dataflow_parent") - ) - iname = parent_model.graph.input[0].name - oname = parent_model.graph.output[0].name - sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] - sdp_node = getCustomOp(sdp_node) - sdp_node.set_nodeattr("model", prev_chkpt_name) - ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True) - y = ret[oname] - assert np.isclose(y, output_tensor_npy).all() - - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "deploy_" + kind - ) - end2end_example = "%s_w%da%d_%s" % (topology, wbits, abits, kind) - model = load_test_checkpoint_or_skip(prev_chkpt_name) # NOQA - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - ret = dict() - # try a range of batch sizes, some may fail due to insufficient DMA - # buffers - bsize_range_in = [8**i for i in range(5)] - bsize_range = [] - for bsize in bsize_range_in: - res = throughput_test_remote(model, bsize) - if res is not None: - ret[bsize] = res - bsize_range.append(bsize) - else: - # assume we reached largest possible N - break - y = [ret[key]["runtime[ms]"] for key in bsize_range] - lrret = linregress(bsize_range, y) - ret_str = "" - ret_str += "\n" + "%s Throughput Test Results" % end2end_example - ret_str += "\n" + "-----------------------------" - ret_str += "\n" + "From linear regression:" - ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept - ret_str += "\n" + "Time per sample: %f ms" % lrret.slope - ret_str += "\n" + "Raw data:" - - ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( - "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]" - ) - for k in bsize_range: - v = ret[k] - ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( - k, - np.round(v["runtime[ms]"], 4), - v["fclk[mhz]"], - np.round(v["throughput[images/s]"], 2), - np.round(v["DRAM_in_bandwidth[MB/s]"], 2), - np.round(v["DRAM_out_bandwidth[MB/s]"], 2), - ) - ret_str += "\n" + "-----------------------------" - 
warnings.warn(ret_str) - largest_bsize = bsize_range[-1] - update_dashboard_data( - topology, wbits, abits, "fclk[mhz]", ret[largest_bsize]["fclk[mhz]"] - ) - update_dashboard_data( - topology, - wbits, - abits, - "throughput[images/s]", - ret[largest_bsize]["throughput[images/s]"], - ) - - def test_upload_results_to_dashboard(self, topology, wbits, abits, QONNX_export): - # ToDo: Extend the dashboard to also upload QONNX exported models? - if QONNX_export: - pytest.skip("Dashboard data upload is disabled for QONNX exported models.") - else: - dashboard_data = get_dashboard_data(topology, wbits, abits) - if len(dashboard_data.keys()) > 0: - upload_to_end2end_dashboard(dashboard_data) - else: - pytest.skip("No data to upload to dashboard") + model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + board)) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index b6482dc96c..2de55db0d9 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -26,23 +26,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo import json import numpy as np import os import shutil -import subprocess import torch import torch.nn as nn -import wget from brevitas.core.quant import QuantType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantIdentity, QuantLinear, QuantReLU -from brevitas.quant_tensor import QuantTensor from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -51,20 +45,20 @@ import finn.builder.build_dataflow_config as build_cfg from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir -from finn.util.test import get_build_env, load_test_checkpoint_or_skip +from finn.util.test import load_test_checkpoint_or_skip target_clk_ns = 10 -build_kind = "zynq" +build_board = "Pynq-Z1" build_dir = os.environ["FINN_BUILD_DIR"] -def get_checkpoint_name(step, QONNX_export): +def get_checkpoint_name(step): if step == "build": # checkpoint for build step is an entire dir - return build_dir + "/end2end_cybsecmlp_build_QONNX-%d" % (QONNX_export) + return build_dir + "/end2end_cybsecmlp_build" else: # other checkpoints are onnx files - return build_dir + "/end2end_cybsecmlp_QONNX-%d_%s.onnx" % (QONNX_export, step) + return build_dir + "/end2end_cybsecmlp_%s.onnx" % step class CybSecMLPForExport(nn.Module): @@ -85,10 +79,9 @@ def forward(self, x): return out_final -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.end2end -def test_end2end_cybsec_mlp_export(QONNX_export): - assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/") +def test_end2end_cybsec_mlp_export(): + assets_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/cybsec-mlp" # load up trained net in Brevitas input_size = 593 hidden1 = 64 @@ -112,88 +105,57 @@ def test_end2end_cybsec_mlp_export(QONNX_export): QuantReLU(bit_width=act_bit_width), QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width), ) - trained_state_dict = torch.load(assets_dir + "/state_dict.pth")[ - "models_state_dict" - ][0] + trained_state_dict = torch.load(assets_dir + "/state_dict.pth")["models_state_dict"][0] 
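Reviewer note: context for the padding lines just below, which are unchanged by this patch: the first layer was trained on 593 inputs while the exported interface is padded to 600, so seven zero columns are appended to the weight matrix before export. In isolation:

import numpy as np

# hypothetical stand-in for the first layer's trained weights
# (hidden1 = 64 outputs, input_size = 593 inputs, as in the test)
W_orig = np.random.randn(64, 593).astype(np.float32)
# pad the second (593-sized) dimension with 7 zeroes at the end: 593 -> 600
W_new = np.pad(W_orig, [(0, 0), (0, 7)])
assert W_new.shape == (64, 600)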
model.load_state_dict(trained_state_dict, strict=False) W_orig = model[0].weight.data.detach().numpy() # pad the second (593-sized) dimension with 7 zeroes at the end, to match the (1, 600) export input W_new = np.pad(W_orig, [(0, 0), (0, 7)]) model[0].weight.data = torch.from_numpy(W_new) model_for_export = CybSecMLPForExport(model) - export_onnx_path = get_checkpoint_name("export", QONNX_export) + export_onnx_path = get_checkpoint_name("export") input_shape = (1, 600) - # create a QuantTensor instance to mark the input as bipolar during export - input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32) - input_a = 2 * input_a - 1 - scale = 1.0 - input_t = torch.from_numpy(input_a * scale) - input_qt = QuantTensor( - input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True - ) - if QONNX_export: - # With the BrevitasONNXManager we need to manually set - # the FINN DataType at the input - BrevitasONNXManager.export( - model_for_export, input_shape, export_path=export_onnx_path - ) - model = ModelWrapper(export_onnx_path) - model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"]) - model.save(export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - bo.export_finn_onnx( - model_for_export, export_path=export_onnx_path, input_t=input_qt - ) + # With the ONNX export from Brevitas we need to manually set + # the FINN DataType at the input + export_qonnx(model_for_export, torch.randn(input_shape), export_path=export_onnx_path) + model = ModelWrapper(export_onnx_path) + model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"]) + model.save(export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) assert os.path.isfile(export_onnx_path) # fix input datatype - finn_model = ModelWrapper(export_onnx_path) - finnonnx_in_tensor_name = finn_model.graph.input[0].name - assert tuple(finn_model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600) + finnonnx_in_tensor_name = model.graph.input[0].name + assert tuple(model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600) # verify a few exported ops - if QONNX_export: - # The first "Mul" node doesn't exist in the QONNX export, - # because the QuantTensor scale is not exported. - # However, this node would have been unity scale anyways and - # the models are still equivalent. - assert finn_model.graph.node[0].op_type == "Add" - assert finn_model.graph.node[1].op_type == "Div" - assert finn_model.graph.node[2].op_type == "MatMul" - assert finn_model.graph.node[-1].op_type == "MultiThreshold" - else: - assert finn_model.graph.node[0].op_type == "Mul" - assert finn_model.get_initializer(finn_model.graph.node[0].input[1]) == 1.0 - assert finn_model.graph.node[1].op_type == "Add" - assert finn_model.graph.node[2].op_type == "Div" - assert finn_model.graph.node[3].op_type == "MatMul" - assert finn_model.graph.node[-1].op_type == "MultiThreshold" + # The first "Mul" node doesn't exist in the QONNX export, + # because the QuantTensor scale is not exported. + # However, this node would have been unity scale anyway and + # the models are still equivalent.
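[The export flow introduced above (QONNX export from Brevitas, manually marking the input FINN DataType, cleanup, then conversion to the FINN dialect) is the pattern this diff applies across all the end2end and convert-to-HLS tests; the asserts that follow check the graph it produces. Condensed into a minimal sketch, where export_to_finn is a hypothetical helper and torch_model, ishape, path are placeholders:

    import torch
    from brevitas.export import export_qonnx
    from qonnx.core.datatype import DataType
    from qonnx.core.modelwrapper import ModelWrapper
    from qonnx.util.cleanup import cleanup as qonnx_cleanup
    from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

    def export_to_finn(torch_model, ishape, path, bipolar_input=False):
        # export to QONNX, using a dummy input to fix the shape
        export_qonnx(torch_model, torch.randn(ishape), export_path=path)
        if bipolar_input:
            # the export does not carry the bipolar marking, so set it by hand
            model = ModelWrapper(path)
            model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"])
            model.save(path)
        qonnx_cleanup(path, out_file=path)
        model = ModelWrapper(path)
        # convert the QONNX dialect to FINN's internal representation
        return model.transform(ConvertQONNXtoFINN())
]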
+ assert model.graph.node[0].op_type == "Add" + assert model.graph.node[1].op_type == "Div" + assert model.graph.node[2].op_type == "MatMul" + assert model.graph.node[-1].op_type == "MultiThreshold" # verify datatypes on some tensors - assert ( - finn_model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType["BIPOLAR"] - ) - first_matmul_w_name = finn_model.get_nodes_by_op_type("MatMul")[0].input[1] - assert finn_model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"] + assert model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType["BIPOLAR"] + first_matmul_w_name = model.get_nodes_by_op_type("MatMul")[0].input[1] + assert model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"] @pytest.mark.slow @pytest.mark.vivado @pytest.mark.end2end -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_end2end_cybsec_mlp_build(QONNX_export): - model_file = get_checkpoint_name("export", QONNX_export) +def test_end2end_cybsec_mlp_build(): + model_file = get_checkpoint_name("export") load_test_checkpoint_or_skip(model_file) - build_env = get_build_env(build_kind, target_clk_ns) - output_dir = make_build_dir(f"test_end2end_cybsec_mlp_build_QONNX-{QONNX_export}") + output_dir = make_build_dir("test_end2end_cybsec_mlp_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, target_fps=1000000, synth_clk_period_ns=target_clk_ns, - board=build_env["board"], + board=build_board, shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, @@ -223,64 +185,7 @@ def test_end2end_cybsec_mlp_build(QONNX_export): assert est_cycles_dict["MatrixVectorActivation_1"] == 64 with open(est_res_report, "r") as f: est_res_dict = json.load(f) - assert est_res_dict["total"]["LUT"] == 11360.0 + assert est_res_dict["total"]["LUT"] == 7904.0 assert est_res_dict["total"]["BRAM_18K"] == 36.0 - shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build", QONNX_export)) - - -@pytest.mark.end2end -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_end2end_cybsec_mlp_run_on_hw(QONNX_export): - build_env = get_build_env(build_kind, target_clk_ns) - assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/") - deploy_dir = get_checkpoint_name("build", QONNX_export) - if not os.path.isdir(deploy_dir): - pytest.skip(deploy_dir + " not found from previous test step, skipping") - driver_dir = deploy_dir + "/driver" - assert os.path.isdir(driver_dir) - # put all assets into driver dir - shutil.copy(assets_dir + "/validate-unsw-nb15.py", driver_dir) - # put a copy of binarized dataset into driver dir - dataset_url = ( - "https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1" - ) - dataset_local = driver_dir + "/unsw_nb15_binarized.npz" - if not os.path.isfile(dataset_local): - wget.download(dataset_url, out=dataset_local) - assert os.path.isfile(dataset_local) - # create a shell script for running validation: 10 batches x 10 imgs - with open(driver_dir + "/validate.sh", "w") as f: - f.write( - """#!/bin/bash -cd %s/driver -echo %s | sudo -S python3.6 validate-unsw-nb15.py --batchsize=10 --limit_batches=10 - """ - % ( - build_env["target_dir"] + "/end2end_cybsecmlp_build", - build_env["password"], - ) - ) - # set up rsync command - remote_target = "%s@%s:%s" % ( - build_env["username"], - build_env["ip"], - build_env["target_dir"], - ) - rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target]) - assert rsync_res.returncode == 0 - remote_verif_cmd = [ - "ssh", - "%s@%s" % 
(build_env["username"], build_env["ip"]), - "sh", - build_env["target_dir"] + "/end2end_cybsecmlp_build/driver/validate.sh", - ] - verif_res = subprocess.run( - remote_verif_cmd, - stdout=subprocess.PIPE, - universal_newlines=True, - input=build_env["password"], - ) - assert verif_res.returncode == 0 - log_output = verif_res.stdout.split("\n") - assert log_output[-3] == "batch 10 / 10 : total OK 93 NOK 7" - assert log_output[-2] == "Final accuracy: 93.000000" + shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build")) + shutil.rmtree(get_checkpoint_name("build")) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 2f4df956ac..2d25a2bf0d 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -27,11 +27,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -import brevitas.onnx as bo import numpy as np import os import time import torch +from brevitas.export import export_qonnx from PIL import Image from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -52,6 +52,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.transformation.remove import RemoveIdentityOps +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb @@ -63,6 +64,7 @@ ) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds @@ -95,12 +97,12 @@ def test_end2end_mobilenet_export(): std = 0.226 ch = 3 preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) # set input finn datatype to UINT8 - preproc_model.set_tensor_datatype( - preproc_model.graph.input[0].name, DataType["UINT8"] - ) + preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"]) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(FoldConstants()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) @@ -111,7 +113,8 @@ def test_end2end_mobilenet_export(): # export mobilenet finn_onnx = build_dir + "/end2end_mobilenet_export.onnx" mobilenet = get_test_model_trained("mobilenet", 4, 4) - bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) # calculate golden output with pytorch/brevitas and save as .npy # get single image as input and prepare image @@ -145,10 +148,9 @@ def test_end2end_mobilenet_export(): @pytest.mark.end2end def test_end2end_mobilenet_tidy_and_merge_with_preproc(): - preproc_model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_preproc.onnx" - ) + preproc_model = 
load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_preproc.onnx") model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_export.onnx") + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(InsertTopK()) @@ -191,17 +193,13 @@ def test_end2end_mobilenet_streamline(): model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) model.save(build_dir + "/end2end_mobilenet_streamlined.onnx") - assert ( - len(model.get_nodes_by_op_type("Add")) == 1 - ) # only final quantized bias Add op remains + assert len(model.get_nodes_by_op_type("Add")) == 1 # only final quantized bias Add op remains assert len(model.get_nodes_by_op_type("Mul")) == 0 # no Mul ops remain @pytest.mark.end2end def test_end2end_mobilenet_lowering(): - model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_streamlined.onnx" - ) + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_streamlined.onnx") model = model.transform(LowerConvsToMatMul()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) @@ -229,9 +227,7 @@ def test_end2end_mobilenet_convert_to_hls_layers(): @pytest.mark.end2end def test_end2end_mobilenet_folding(): - model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_hls_layers.onnx" - ) + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hls_layers.onnx") # optional extra folding to use fewer resources # applied while setting the attributes on each node assert extra_fold in [1, 2, 4] diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py index 9483ccf0b2..2f5f136d3a 100644 --- a/tests/end2end/test_ext_weights.py +++ b/tests/end2end/test_ext_weights.py @@ -26,8 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
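[A side note on the asset handling changed in this file and in the cybsec test earlier: lookups via pkg_resources.resource_filename are replaced throughout by paths rooted at the FINN_ROOT environment variable, which the tests now assume is set to the FINN checkout. Roughly:

    import os

    # old style (removed):
    #   pk.resource_filename("finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json")
    # new style: resolve test assets relative to the FINN_ROOT checkout
    test_data = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/test_ext_weights"
    folding_config_file = test_data + "/tfc-w1a1-extw.json"
]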
-import pkg_resources as pk - import pytest import os @@ -38,10 +36,9 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from finn.util.test import get_build_env, load_test_checkpoint_or_skip +from finn.util.test import load_test_checkpoint_or_skip target_clk_ns = 10 -build_kind = "zynq" build_dir = os.environ["FINN_BUILD_DIR"] onnx_zip_url = "https://github.com/Xilinx/finn-examples" onnx_zip_url += "/releases/download/v0.0.1a/onnx-models-bnn-pynq.zip" @@ -83,16 +80,15 @@ def test_end2end_ext_weights_download(): def test_end2end_ext_weights_build(): model_file = get_checkpoint_name("download") load_test_checkpoint_or_skip(model_file) - build_env = get_build_env(build_kind, target_clk_ns) - folding_config_file = pk.resource_filename( - "finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json" - ) + test_data = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/test_ext_weights" + folding_config_file = test_data + "/tfc-w1a1-extw.json" output_dir = make_build_dir("test_end2end_ext_weights_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, + verbose=True, folding_config_file=folding_config_file, synth_clk_period_ns=target_clk_ns, - board=build_env["board"], + board="Pynq-Z1", shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, @@ -109,67 +105,3 @@ def test_end2end_ext_weights_build(): if os.path.isdir(get_checkpoint_name("build")): shutil.rmtree(get_checkpoint_name("build")) shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build")) - - -@pytest.mark.board -@pytest.mark.end2end -def test_end2end_ext_weights_dataset(): - # make sure we have local copies of mnist dataset files - subprocess.check_output(["mkdir", "-p", mnist_local]) - for f in mnist_files: - if not os.path.isfile(mnist_local + "/" + f): - wget.download(mnist_url + "/" + f, out=mnist_local + "/" + f) - assert os.path.isfile(mnist_local + "/" + f) - # rsync to board - build_env = get_build_env(build_kind, target_clk_ns) - mnist_target = "%s@%s:%s" % (build_env["username"], build_env["ip"], "/tmp/") - - rsync_dataset_cmd = ["rsync", "-rv", mnist_local + "/", mnist_target] - subprocess.check_output(rsync_dataset_cmd) - - -@pytest.mark.end2end -def test_end2end_ext_weights_run_on_hw(): - build_env = get_build_env(build_kind, target_clk_ns) - deploy_dir = get_checkpoint_name("build") - if not os.path.isdir(deploy_dir): - pytest.skip(deploy_dir + " not found from previous test step, skipping") - driver_dir = deploy_dir + "/driver" - assert os.path.isdir(driver_dir) - # create a shell script for running validation: 10 batches x 10 imgs - with open(driver_dir + "/validate.sh", "w") as f: - f.write( - """#!/bin/bash -cd %s/driver -echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s - """ - % ( - build_env["target_dir"] + "/end2end_ext_weights_build", - build_env["password"], - "../bitfile/finn-accel.bit", - ) - ) - # set up rsync command - remote_target = "%s@%s:%s" % ( - build_env["username"], - build_env["ip"], - build_env["target_dir"], - ) - rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target]) - assert rsync_res.returncode == 0 - remote_verif_cmd = [ - "ssh", - "%s@%s" % (build_env["username"], build_env["ip"]), - "sh", - build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh", - ] - verif_res = subprocess.run( - remote_verif_cmd, - stdout=subprocess.PIPE, - universal_newlines=True, - 
input=build_env["password"], - ) - assert verif_res.returncode == 0 - log_output = verif_res.stdout.split("\n") - assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704" - assert log_output[-2] == "Final accuracy: 92.960000" diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py index 49ee32c71e..f5edabbd4b 100644 --- a/tests/fpgadataflow/test_code_gen_trafo.py +++ b/tests/fpgadataflow/test_code_gen_trafo.py @@ -32,7 +32,7 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import gen_finn_dt_tensor, get_by_name +from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -70,7 +70,7 @@ def test_code_gen_trafo(): nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index 9bafb101ce..d04b68a56b 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -32,7 +32,7 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import gen_finn_dt_tensor, get_by_name +from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -71,7 +71,7 @@ def test_compilation_trafo(): nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py index 5bbaefac2d..2af0957e12 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py @@ -38,7 +38,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -66,11 +66,12 @@ ], ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] + if use_rtl_swg and exec_mode == 
"cppsim": + pytest.skip("cppsim not supported for RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] @@ -92,12 +96,8 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): out_chn = 20 conv_param_shape = [out_chn, in_chn, k_h, k_w] - out_feature_dim_h = compute_conv_output_dim( - in_feature_dim_h, k_h, stride_h, pad_h, dilation_h - ) - out_feature_dim_w = compute_conv_output_dim( - in_feature_dim_w, k_w, stride_w, pad_w, dilation_w - ) + out_feature_dim_h = compute_conv_output_dim(in_feature_dim_h, k_h, stride_h, pad_h, dilation_h) + out_feature_dim_w = compute_conv_output_dim(in_feature_dim_w, k_w, stride_w, pad_w, dilation_w) input_shape = [1, in_chn, in_feature_dim_h, in_feature_dim_w] output_shape = [1, out_chn, out_feature_dim_h, out_feature_dim_w] @@ -113,19 +113,15 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) - value_info = [ - helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) - ] + value_info = [helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)] - modelproto = helper.make_model( + modelproto = qonnx_make_model( helper.make_graph( name="conv_test", inputs=[top_in], outputs=[top_out], value_info=value_info, - nodes=[ - helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config) - ], + nodes=[helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)], ) ) @@ -139,7 +135,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py index 0f19b6d79a..bb2c1d74c2 100644 --- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py @@ -35,7 +35,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -52,12 +52,11 @@ def prepare_inputs(input_tensor): def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) - model = helper.make_model( + model = qonnx_make_model( helper.make_graph( name="test", inputs=[inp], @@ -76,13 +75,9 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): # parameter datatype -@pytest.mark.parametrize( - "pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]] -) +@pytest.mark.parametrize("pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]]) # input datatype 
-@pytest.mark.parametrize( - "idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]] -) +@pytest.mark.parametrize("idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]]) # function @pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) # vector parameter or scalar parameter (broadcast) @@ -92,9 +87,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_channelwise_layer( - pdt, idt, onnx_op_name, scalar_param, exec_mode -): +def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): ifm_ch = 16 ifm_dim = 5 ishape = (1, ifm_ch, ifm_dim, ifm_dim) @@ -134,9 +127,7 @@ def test_convert_to_hls_channelwise_layer( else: raise Exception("Unknown exec_mode") - ctx_produced = oxe.execute_onnx( - new_model, input_dict, return_full_exec_context=True - ) + ctx_produced = oxe.execute_onnx(new_model, input_dict, return_full_exec_context=True) y_produced = ctx_produced["outp"] assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py index 0760ff9b37..94007bdd14 100755 --- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py @@ -39,7 +39,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -102,12 +102,8 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): out_chn = 8 conv_param_shape = [out_chn, in_chn, kernel_size_h, kernel_size_w] - output_size_h = compute_conv_output_dim( - input_size_h, kernel_size_h, stride_h, 2 * pad_h - ) - output_size_w = compute_conv_output_dim( - input_size_w, kernel_size_w, stride_w, 2 * pad_w - ) + output_size_h = compute_conv_output_dim(input_size_h, kernel_size_h, stride_h, 2 * pad_h) + output_size_w = compute_conv_output_dim(input_size_w, kernel_size_w, stride_w, 2 * pad_w) input_shape = [1, in_chn, input_size_h, input_size_w] fc_param_shape = [out_chn * output_size_h * output_size_w, fc_filters] @@ -120,45 +116,29 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): conv_config["pads"] = [pad_h, pad_w, pad_h, pad_w] conv_config["strides"] = [stride_h, stride_w] - global_in = helper.make_tensor_value_info( - "global_in", TensorProto.FLOAT, input_shape - ) - global_out = helper.make_tensor_value_info( - "global_out", TensorProto.FLOAT, output_shape - ) + global_in = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, input_shape) + global_out = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, output_shape) value_info = [ - helper.make_tensor_value_info( - "conv_param", TensorProto.FLOAT, conv_param_shape - ), + helper.make_tensor_value_info("conv_param", TensorProto.FLOAT, conv_param_shape), helper.make_tensor_value_info("thres1_param", TensorProto.FLOAT, (out_chn, 15)), - helper.make_tensor_value_info( - "matmul_param", TensorProto.FLOAT, fc_param_shape - ), - helper.make_tensor_value_info( - "thres2_param", TensorProto.FLOAT, (fc_filters, 15) - ), + 
helper.make_tensor_value_info("matmul_param", TensorProto.FLOAT, fc_param_shape), + helper.make_tensor_value_info("thres2_param", TensorProto.FLOAT, (fc_filters, 15)), helper.make_tensor_value_info("reshape_shape", TensorProto.INT64, []), ] if use_reshape: - flatten_node = helper.make_node( - "Reshape", ["thres1_out", "reshape_shape"], ["flatten_out"] - ) + flatten_node = helper.make_node("Reshape", ["thres1_out", "reshape_shape"], ["flatten_out"]) else: - flatten_node = helper.make_node( - "Flatten", ["thres1_out"], ["flatten_out"], axis=1 - ) + flatten_node = helper.make_node("Flatten", ["thres1_out"], ["flatten_out"], axis=1) - modelproto = helper.make_model( + modelproto = qonnx_make_model( helper.make_graph( name="test", inputs=[global_in], outputs=[global_out], value_info=value_info, nodes=[ - helper.make_node( - "Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config - ), + helper.make_node("Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config), helper.make_node( "MultiThreshold", ["conv_out", "thres1_param"], @@ -167,9 +147,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): out_dtype="UINT4", ), flatten_node, - helper.make_node( - "MatMul", ["flatten_out", "matmul_param"], ["matmul_out"] - ), + helper.make_node("MatMul", ["flatten_out", "matmul_param"], ["matmul_out"]), helper.make_node( "MultiThreshold", ["matmul_out", "thres2_param"], @@ -190,18 +168,10 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): model.set_tensor_datatype("thres1_param", DataType["INT32"]) model.set_tensor_datatype("thres2_param", DataType["INT32"]) - model.set_initializer( - "conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape) - ) - model.set_initializer( - "thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0) - ) - model.set_initializer( - "thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0) - ) - model.set_initializer( - "matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape) - ) + model.set_initializer("conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)) + model.set_initializer("thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0)) + model.set_initializer("thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0)) + model.set_initializer("matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape)) model.set_initializer("reshape_shape", np.array([1, -1], dtype=np.int64)) model = model.transform(InferShapes()) diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 55dc77cafb..95beffafac 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -38,7 +38,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -57,11 +57,12 @@ "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)] ) @pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("use_rtl_swg", [False, True]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow 
@pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): +def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -69,6 +70,9 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): in_feature_dim = 7 in_chn = 16 + if use_rtl_swg and exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL SWG") + if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, kernel_size, kernel_size] @@ -78,9 +82,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size] total_pad = 2 * pad - out_feature_dim = compute_conv_output_dim( - in_feature_dim, kernel_size, stride, total_pad - ) + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, total_pad) input_shape = [1, in_chn, in_feature_dim, in_feature_dim] output_shape = [1, out_chn, out_feature_dim, out_feature_dim] @@ -96,19 +98,15 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) - value_info = [ - helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) - ] + value_info = [helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)] - modelproto = helper.make_model( + modelproto = qonnx_make_model( helper.make_graph( name="conv_test", inputs=[top_in], outputs=[top_out], value_info=value_info, - nodes=[ - helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config) - ], + nodes=[helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)], ) ) @@ -122,7 +120,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen()) + new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) if depthwise is True: new_model = new_model.transform(to_hls.InferVectorVectorActivation()) else: @@ -156,7 +154,8 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): x = gen_finn_dt_tensor(idt, input_shape) inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) - if kernel_size == 1 and stride > 1 and pad == 0: + + if not use_rtl_swg and kernel_size == 1 and stride > 1 and pad == 0: assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": node = new_model.get_nodes_by_op_type("DownSampler")[0] @@ -167,8 +166,11 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): assert np.isclose(exp_cycles, cycles_rtlsim, atol=11) assert exp_cycles != 0 - if pad == 1: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + if pad: + if use_rtl_swg: + padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] + else: + padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py index 9997f28438..c9cb4f0802 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ 
b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py @@ -26,21 +26,26 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import os +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -48,6 +53,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC from finn.util.test import get_test_model_trained @@ -61,11 +67,14 @@ @pytest.mark.parametrize("fused_activation", [True, False]) def test_convert_to_hls_layers_cnv_w1a1(fused_activation): cnv = get_test_model_trained("CNV", 1, 1) - bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) + export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) + qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv) model = ModelWrapper(export_onnx_path_cnv) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(LowerConvsToMatMul()) @@ -75,10 +84,10 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) model = model.transform(InferDataLayouts()) - # model.save("golden.onnx") # load one of the test vectors - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # generate expected value from streamlined net @@ -128,11 +137,9 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): assert len(swg_nodes) == 6 mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_Batch") assert len(mp_nodes) == 2 - # model.save("cnv-pre-compile.onnx") model = model.transform(PrepareCppSim()) model = 
model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) - # model.save("cnv-post-compile.onnx") produced_ctx = oxe.execute_onnx(model, input_dict, True) produced = produced_ctx[model.graph.output[0].name] assert np.isclose(expected, produced, atol=1e-3).all() diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py index fd4e3679d7..8a7b2509a4 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py @@ -28,19 +28,24 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph import os import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -48,6 +53,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.test import get_test_model_trained @@ -59,11 +65,14 @@ @pytest.mark.vivado def test_convert_to_hls_layers_tfc_w1a1(): tfc = get_test_model_trained("TFC", 1, 1) - bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) @@ -130,11 +139,15 @@ def test_convert_to_hls_layers_tfc_w1a1(): @pytest.mark.vivado def test_convert_to_hls_layers_tfc_w1a2(): tfc = get_test_model_trained("TFC", 1, 2) - bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) + model.save(export_onnx_path) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) from finn.transformation.fpgadataflow.convert_to_hls_layers import ( diff --git 
a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py index 79a48793e0..f8e566156b 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py @@ -43,7 +43,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -91,21 +91,11 @@ def make_model(ch, ifmdim): add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"]) add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name]) add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name]) - mul1_node = helper.make_node( - "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name] - ) - mul2_node = helper.make_node( - "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name] - ) - eltwise_add_node = helper.make_node( - "Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name] - ) - globalavgpool_node = helper.make_node( - "GlobalAveragePool", [eltwise_add.name], [pool.name] - ) - reshape_node = helper.make_node( - "Reshape", [pool.name, reshape_ct.name], [outp.name] - ) + mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) + mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) + eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]) + globalavgpool_node = helper.make_node("GlobalAveragePool", [eltwise_add.name], [pool.name]) + reshape_node = helper.make_node("Reshape", [pool.name, reshape_ct.name], [outp.name]) graph = helper.make_graph( nodes=[ @@ -123,7 +113,7 @@ def make_model(ch, ifmdim): outputs=[outp], ) - model = helper.make_model(graph, producer_name="add-model") + model = qonnx_make_model(graph, producer_name="add-model") model = ModelWrapper(model) # set initializers for scalar add/mul nodes diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py index ef9bd7a13d..417b4fbae2 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -35,7 +35,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -48,9 +48,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -def make_single_maxpool_modelwrapper( - k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False -): +def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False): odt = idt if use_1d: ishape = [1, ifm_ch, 1, ifm_dim] @@ -74,11 +72,9 @@ def make_single_maxpool_modelwrapper( pads=pads, strides=strides, ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], 
outputs=[outp]) - model = helper.make_model(graph, producer_name="mp-model") + model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -89,12 +85,8 @@ def make_single_maxpool_modelwrapper( def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt): - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]) mp_node = helper.make_node( "QuantAvgPool2d", @@ -108,11 +100,9 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id signed=1 if idt.signed() else 0, data_layout="NCHW", ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="mp-model") + model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -143,9 +133,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_pool_batch( - idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode -): +def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): k, stride, pad, ifm_dim = pool_config if ifm_ch % pe != 0: @@ -184,9 +172,7 @@ def test_convert_to_hls_pool_batch( if idt.signed() != odt.signed(): pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()") - model = make_single_quantavpool_modelwrapper( - k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt - ) + model = make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt) else: assert False, "{} is not a supported op_type".format(op_type) @@ -209,18 +195,14 @@ def test_convert_to_hls_pool_batch( if pad == 0: assert len(new_model.graph.node) == 4 assert new_model.graph.node[0].op_type == "Transpose" - assert new_model.graph.node[1].op_type.startswith( - "ConvolutionInputGenerator" - ) + assert new_model.graph.node[1].op_type.startswith("ConvolutionInputGenerator") assert new_model.graph.node[2].op_type == "Pool_Batch" assert new_model.graph.node[3].op_type == "Transpose" else: assert len(new_model.graph.node) == 5 assert new_model.graph.node[0].op_type == "Transpose" assert new_model.graph.node[1].op_type == "FMPadding_Batch" - assert new_model.graph.node[2].op_type.startswith( - "ConvolutionInputGenerator" - ) + assert new_model.graph.node[2].op_type.startswith("ConvolutionInputGenerator") assert new_model.graph.node[3].op_type == "Pool_Batch" assert new_model.graph.node[4].op_type == "Transpose" else: diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 5228ade3d0..2ffd696528 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -37,7 +37,11 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from qonnx.util.basic import 
( + calculate_signed_dot_prod_range, + gen_finn_dt_tensor, + qonnx_make_model, +) import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -53,7 +57,6 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): - # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) ofm_ch = ifm_ch total_pad = 2 * padding @@ -80,16 +83,10 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): ) # set up onnx model - inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch] - ) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch]) - W_sparse = oh.make_tensor_value_info( - "W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch] - ) + W_sparse = oh.make_tensor_value_info("W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch]) im2col_node = oh.make_node( "Im2Col", @@ -103,9 +100,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): depthwise=1, ) - matmul_node = oh.make_node( - "MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"] - ) + matmul_node = oh.make_node("MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"]) if act is None: node_list = [im2col_node, matmul_node] @@ -123,7 +118,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): outputs=[global_out], value_info=value_info, ) - model = oh.make_model(graph, producer_name="lowered_dw_cnv-model") + model = qonnx_make_model(graph, producer_name="lowered_dw_cnv-model") model = ModelWrapper(model) # initialize model diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py new file mode 100644 index 0000000000..f3716dea9b --- /dev/null +++ b/tests/fpgadataflow/test_fifosizing.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pytest + +import json +import shutil +import torch +from brevitas.export import export_qonnx +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp + +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir +from finn.util.test import get_trained_network_and_ishape + + +def fetch_test_model(topology, wbits=2, abits=2): + tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology) + (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) + chkpt_name = tmp_output_dir + "/model.onnx" + export_qonnx(model, torch.randn(ishape), chkpt_name) + return tmp_output_dir + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +@pytest.mark.parametrize( + "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"] +) +@pytest.mark.parametrize("topology", ["tfc", "cnv"]) +def test_fifosizing_linear(method, topology): + force_python_rtlsim = "python" in method + method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize" + tmp_output_dir = fetch_test_model(topology) + cfg = build_cfg.DataflowBuildConfig( + output_dir=tmp_output_dir, + auto_fifo_depths=True, + auto_fifo_strategy=method_key, + target_fps=10000 if topology == "tfc" else 1000, + force_python_rtlsim=force_python_rtlsim, + synth_clk_period_ns=10.0, + board="Pynq-Z1", + rtlsim_batch_size=100 if topology == "tfc" else 2, + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + ], + default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, + ) + build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) + with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: + est_data = json.load(f) + with open(tmp_output_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + assert ( + float(sim_data["stable_throughput[images/s]"]) / float(est_data["estimated_throughput_fps"]) + > 0.9 + ) + # now run the same build using the generated folding and FIFO config + tmp_output_dir_cmp = fetch_test_model(topology) + cfg_cmp = cfg + cfg_cmp.output_dir = tmp_output_dir_cmp + cfg_cmp.auto_fifo_depths = False + cfg_cmp.target_fps = None + cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP] + cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json" + build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) + + model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") + model1 = ModelWrapper(tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx") + + assert len(model0.graph.node) == len(model1.graph.node) + for i in range(len(model0.graph.node)): + node0 = model0.graph.node[i] + node1 = model1.graph.node[i] + assert 
node0.op_type == node1.op_type + if node0.op_type == "StreamingFIFO": + node0_inst = getCustomOp(node0) + node1_inst = getCustomOp(node1) + assert node0_inst.get_nodeattr("depth") == node1_inst.get_nodeattr("depth") + + shutil.rmtree(tmp_output_dir) + shutil.rmtree(tmp_output_dir_cmp) diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 6d881f45b6..1ad2c26610 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -68,7 +68,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): outputs=[outp], ) - model = helper.make_model(graph, producer_name="addstreams-model") + model = qonnx_make_model(graph, producer_name="addstreams-model") model = ModelWrapper(model) model.set_tensor_datatype("inp1", idt) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index ceafda90e5..186a6af42c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -51,9 +51,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): NumChannels = C.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, vecs + [NumChannels] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, vecs + [NumChannels]) node_inp_list = ["inp", "const"] @@ -73,7 +71,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): ) graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="model") + model = qonnx_make_model(graph, producer_name="model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 5e79ea2dad..403bb328ae 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -36,7 +36,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.core.rtlsim_exec import rtlsim_exec @@ -115,7 +115,7 @@ def create_two_fc_model(): value_info=[mid], ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model 
= ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -133,6 +133,7 @@ def create_two_fc_model(): return model +@pytest.mark.vivado @pytest.mark.fpgadataflow def test_fpgadataflow_checksum(): # use a graph consisting of two fc layers to test @@ -214,11 +215,7 @@ def write_drain(sim): ), """The second checksums do not match in cppsim vs. rtlsim""" - assert ( - checksum0_drain == 0 - ), "Drain read doesn't match drain write for first checksum" - assert ( - checksum1_drain == 0 - ), "Drain read doesn't match drain write for second checksum" + assert checksum0_drain == 0, "Drain read doesn't match drain write for first checksum" + assert checksum1_drain == 0, "Drain read doesn't match drain write for second checksum" # TODO: test for drain set to true diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index dddc470ec2..2b2069a72b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -72,6 +72,7 @@ def forward(self, *args): @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.parametrize("idt", [DataType["INT4"]]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_concat(exec_mode, idt): @@ -94,6 +95,7 @@ def test_fpgadataflow_concat(exec_mode, idt): assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" if exec_mode == "cppsim": + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) @@ -107,6 +109,7 @@ def test_fpgadataflow_concat(exec_mode, idt): assert (exp_out == ret_sim[oname]).all() +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_concat_stitchedip(): @@ -144,6 +147,5 @@ def test_fpgadataflow_concat_stitchedip(): ) model.set_metadata_prop("exec_mode", "rtlsim") model.set_metadata_prop("rtlsim_trace", "trace.vcd") - model.save("dbg.onnx") ret_sim = execute_onnx(model, inp_dict) assert (exp_out == ret_sim[oname]).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index a196ecbb61..d94b5d6399 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -46,13 +46,9 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -def make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt -): +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] ) @@ -73,7 +69,7 @@ def make_single_im2col_modelwrapper( nodes=[im2col_node], 
name="im2col_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="im2col-model") + model = qonnx_make_model(graph, producer_name="im2col-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -86,9 +82,7 @@ def make_single_slidingwindow_modelwrapper( k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 ): odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] ) @@ -117,7 +111,7 @@ def make_single_slidingwindow_modelwrapper( outputs=[outp], ) - model = helper.make_model(graph, producer_name="slidingwindow-model") + model = qonnx_make_model(graph, producer_name="slidingwindow-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -152,9 +146,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_slidingwindow( - idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw -): +def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw): ofm_dim = int(((ifm_dim - k) / stride) + 1) x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) @@ -187,9 +179,7 @@ def test_fpgadataflow_slidingwindow( if dw == 0: assert (y_produced == y_expected).all() else: - y_expected = y_expected.reshape( - 1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd - ) + y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd) y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py index 0fc3ca82cf..aa89dde5e7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py @@ -35,7 +35,7 @@ from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -49,9 +49,7 @@ fpga_part = "xczu3eg-sbva484-1-e" -def make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt -): +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): k_h, k_w = k ifm_dim_h, ifm_dim_w = ifm_dim stride_h, stride_w = stride @@ -59,9 +57,7 @@ def make_single_im2col_modelwrapper( ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) @@ -82,7 +78,7 @@ def make_single_im2col_modelwrapper( nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="im2col-model") + model = qonnx_make_model(graph, 
producer_name="im2col-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -101,9 +97,7 @@ def make_single_slidingwindow_modelwrapper( ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) @@ -133,7 +127,7 @@ def make_single_slidingwindow_modelwrapper( outputs=[outp], ) - model = helper.make_model(graph, producer_name="slidingwindow-model") + model = qonnx_make_model(graph, producer_name="slidingwindow-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -259,9 +253,7 @@ def test_fpgadataflow_slidingwindow_1d( if dw == 0: assert (y_produced == y_expected).all() else: - y_expected = y_expected.reshape( - 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd - ) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py new file mode 100755 index 0000000000..53d7be0ebb --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -0,0 +1,247 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + + model = qonnx_make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator_rtl", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + M=m, + parallel_window=parallel_window, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + inputs=[inp], + outputs=[outp], + ) + + model = qonnx_make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# kernel size +@pytest.mark.parametrize("k", [[3, 3], [1, 5]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[13, 13], [1, 21]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [6]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# input channel 
parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 3, 6]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0, 1]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_slidingwindow_rtl( + idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if (stride_h > k_h) or (stride_w > k_w) and not parallel_window: + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if k_h == 1 and k_w == 1 and simd != ifm_ch: + pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + m=m, + parallel_window=parallel_window, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareRTLSim()) + + # prepare input data + input_dict = prepare_inputs(x) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + stride=stride, + dilation=dilation, + idt=idt, + ) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py new file mode 100644 index 0000000000..f5a06316e2 --- 
/dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -0,0 +1,594 @@ +# Copyright (c) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import copy +import numpy as np +import onnx.parser as oprs +import os +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_write, reset_rtlsim +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import ( + LowerConvsToMatMul, + _auto_pad_to_explicit_padding, +) +from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.streamline.absorb as absorb +from finn.core.onnx_exec import execute_onnx +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.util.basic import pyverilate_get_liveness_threshold_cycles + + +def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise): + np.random.seed(0) + group = ifm if depthwise else 1 + group_str = str(group) + ishp = (1, ifm, idim_h, idim_w) + pad_0 = _auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, 
stride, 2) + int_dim_h = compute_conv_output_dim(idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]) + int_dim_w = compute_conv_output_dim(idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]) + + pad_1 = _auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) + odim_h = compute_conv_output_dim(int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]) + odim_w = compute_conv_output_dim(int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]) + oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w) + wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k) + wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k) + ishp_str = str(list(ishp)) + oshp_str = str(list(oshp)) + wshp_str = str(list(wshp)) + wshp_1_str = str(list(wshp_1)) + kshp_str = str([k, k]) + pad_0_str = str(list(pad_0)) + pad_1_str = str(list(pad_1)) + stride_str = str([stride, stride]) + dil_str = str([1, 1]) + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{ishp_str} in0) => (float{oshp_str} out0) + < + float{wshp_str} param_c0_weight, + float{wshp_1_str} param_c1_weight + > + {{ + conv0 = Conv< + dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_0_str}, + strides={stride_str} + >(in0, param_c0_weight) + out0 = Conv< + dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_1_str}, + strides={stride_str} + >(conv0, param_c1_weight) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model.set_tensor_datatype("in0", idt) + model.set_tensor_datatype("param_c0_weight", wdt) + model.set_tensor_datatype("param_c1_weight", wdt) + model.set_initializer("param_c0_weight", gen_finn_dt_tensor(wdt, wshp)) + model.set_initializer("param_c1_weight", gen_finn_dt_tensor(wdt, wshp_1)) + return model + + +def update_conv_model_dims(model, idim_new_h, idim_new_w): + cnode = model.get_nodes_by_op_type("Conv")[0] + k, _ = get_by_name(cnode.attribute, "kernel_shape").ints + stride, _ = get_by_name(cnode.attribute, "strides").ints + ishp = model.get_tensor_shape("in0") + n, ci, _, _ = ishp + n, co, _, _ = model.get_tensor_shape("out0") + int_dim_h = compute_conv_output_dim(idim_new_h, k, stride) + int_dim_w = compute_conv_output_dim(idim_new_w, k, stride) + odim_h = compute_conv_output_dim(int_dim_h, k, stride) + odim_w = compute_conv_output_dim(int_dim_w, k, stride) + model.set_tensor_shape("in0", (n, ci, idim_new_h, idim_new_w)) + model.set_tensor_shape("out0", (n, co, odim_h, odim_w)) + # remove all existing shapes + del model.graph.value_info[:] + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +# Helper function to update tensor dimensions manually because shape inference +# does not work on FINN nodes (they assume well-defined tensor shapes). 
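+# Only the spatial H/W entries of the stored (NHWC) shape are rewritten here;
+# the batch and channel dimensions are left untouched.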
+def update_tensor_dim(model, tensor_name, new_hw): + shape = model.get_tensor_shape(tensor_name) + shape[1] = new_hw[0] + shape[2] = new_hw[1] + model.set_tensor_shape(tensor_name, shape) + + +# Helper function that delivers the hook to program the SWG via AXI-Lite +def config_hook(configs): + if configs is None: + return None + + def write_swg_config(sim): + reset_rtlsim(sim) + for axi_name, config in configs: + # Write config registers to the SWG/FMPadding dict + # defines (addr, value) tuples + for config_entry in config.values(): + axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name) + reset_rtlsim(sim) + + return write_swg_config + + +cfg0 = { + "idims": [(32, 32), (8, 8)], + "ifm": 64, + "k": 3, + "stride": 1, + "ofm": 64, + "depthwise": True, + "pad_mode": "SAME_UPPER", + # run synthesis for one configuration + # this helped expose a bug in enum decls previously + # (which config the synth runs on does not matter) + "do_synth": True, +} +cfg1 = { + "idims": [(32, 16), (16, 8)], + "ifm": 4, + "k": 4, + "stride": 1, + "ofm": 8, + "depthwise": False, + "pad_mode": "SAME_UPPER", + "do_synth": False, +} +cfg2 = { + "idims": [(64, 128), (2, 4)], + "ifm": 64, + "k": 3, + "stride": 1, + "ofm": 64, + "depthwise": True, + "pad_mode": "SAME_UPPER", + "do_synth": False, +} + + +@pytest.mark.parametrize("cfg", [cfg0, cfg1, cfg2]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_conv_dynamic(cfg): + do_synth = cfg["do_synth"] + pad_mode = cfg["pad_mode"] + depthwise = cfg["depthwise"] + idims = cfg["idims"] + ifm = cfg["ifm"] + k = cfg["k"] + stride = cfg["stride"] + ofm = cfg["ofm"] + idt = DataType["UINT4"] + wdt = DataType["INT2"] + exp_cfgs = [] + largest_model = None + for idim in idims: + idim_h, idim_w = idim + ishp = (1, ifm, idim_h, idim_w) + np.random.seed(0) + inp = gen_finn_dt_tensor(idt, ishp) + model = create_conv_model( + idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise + ) + _, _, int_dim_h, int_dim_w = model.get_tensor_shape("conv0") + _, _, odim_h, odim_w = model.get_tensor_shape("out0") + pad0 = get_by_name(model.graph.node[0].attribute, "pads").ints + pad1 = get_by_name(model.graph.node[1].attribute, "pads").ints + if idim == max(idims): + # use largest model for hardware conversion + largest_model = copy.deepcopy(model) + golden = execute_onnx(model, {"in0": inp})["out0"] + exp_cfg = ( + (idim_h, idim_w), + (int_dim_h, int_dim_w), + (odim_h, odim_w), + pad0, + pad1, + inp, + golden, + ) + exp_cfgs.append(exp_cfg) + + # convert to hardware and prepare simulation + model = largest_model.transform(LowerConvsToMatMul()) + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + parent_model = model.transform(CreateDataflowPartition()) + sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) + model = ModelWrapper(sdp_inst.get_nodeattr("model")) + assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2 + if pad_mode == "VALID": + assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 0 + else: + assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 2 + dyn_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") + dyn_nodes += model.get_nodes_by_op_type("FMPadding_rtl") + for swg_node in 
dyn_nodes: + getCustomOp(swg_node).set_nodeattr("SIMD", 4) + getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1) + getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) + getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) + comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") + comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation") + for comp_node in comp_nodes: + if depthwise: + getCustomOp(comp_node).set_nodeattr("PE", 4) + else: + getCustomOp(comp_node).set_nodeattr("SIMD", 4) + getCustomOp(comp_node).set_nodeattr("PE", 4) + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5, vitis=do_synth)) + model.set_metadata_prop("exec_mode", "rtlsim") + + # loop through experiment configurations + for exp_cfg in exp_cfgs: + ( + (idim_h, idim_w), + (int_dim_h, int_dim_w), + (odim_h, odim_w), + pad0, + pad1, + inp, + golden, + ) = exp_cfg + conv0_idim_h = idim_h + pad0[0] + pad0[2] + conv0_idim_w = idim_w + pad0[1] + pad0[3] + conv1_idim_h = int_dim_h + pad1[0] + pad1[2] + conv1_idim_w = int_dim_w + pad1[1] + pad1[3] + # get config for the new dimensions + swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") + swg0 = getCustomOp(swg_nodes[0]) + update_tensor_dim(model, swg0.onnx_node.input[0], (conv0_idim_h, conv0_idim_w)) + update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim_h, int_dim_w)) + swg_config0 = swg0.get_dynamic_config((conv0_idim_h, conv0_idim_w)) + swg1 = getCustomOp(swg_nodes[1]) + update_tensor_dim(model, swg1.onnx_node.input[0], (conv1_idim_h, conv1_idim_w)) + update_tensor_dim(model, swg1.onnx_node.output[0], (odim_h, odim_w)) + swg_config1 = swg1.get_dynamic_config((conv1_idim_h, conv1_idim_w)) + if pad_mode != "VALID": + pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl") + padder0 = getCustomOp(pad_nodes[0]) + update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w)) + update_tensor_dim(model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)) + pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0) + padder1 = getCustomOp(pad_nodes[1]) + update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w)) + update_tensor_dim(model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)) + pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1) + configs = [ + ("s_axilite_0_", pad_config0), + ("s_axilite_1_", swg_config0), + ("s_axilite_2_", pad_config1), + ("s_axilite_3_", swg_config1), + ] + else: + configs = [("s_axilite_0_", swg_config0), ("s_axilite_1_", swg_config1)] + # adjust folded shapes for I/O FIFOs + # (since rtlsim_exec uses folded shape info to fold global i/o tensors) + first_node = getCustomOp(model.graph.node[0]) + first_node_shp = list(first_node.get_folded_input_shape()) + first_node_shp[1] = idim_h + first_node_shp[2] = idim_w + first_node.set_nodeattr("folded_shape", first_node_shp) + update_tensor_dim(model, first_node.onnx_node.input[0], (idim_h, idim_w)) + last_node = getCustomOp(model.graph.node[-1]) + last_node_shp = list(last_node.get_folded_output_shape()) + last_node_shp[1] = odim_h + last_node_shp[2] = odim_w + update_tensor_dim(model, last_node.onnx_node.output[0], (odim_h, odim_w)) + 
last_node.set_nodeattr("folded_shape", last_node_shp) + ctx = {"global_in": inp.transpose(0, 2, 3, 1)} + liveness_prev = pyverilate_get_liveness_threshold_cycles() + os.environ["LIVENESS_THRESHOLD"] = "100000" + rtlsim_exec(model, ctx, pre_hook=config_hook(configs)) + os.environ["LIVENESS_THRESHOLD"] = str(liveness_prev) + ret = ctx["global_out"].transpose(0, 3, 1, 2) + assert np.isclose(golden, ret).all() + + +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + + model = qonnx_make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator_rtl", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + M=m, + parallel_window=parallel_window, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + dynamic_mode=1, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + inputs=[inp], + outputs=[outp], + ) + + model = qonnx_make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# kernel size +@pytest.mark.parametrize("k", [[3, 3]]) +# input dimension +@pytest.mark.parametrize("ifm_dim_series", [[[32, 32], [16, 16], [8, 8]]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [6]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1]]) +# Dilation +@pytest.mark.parametrize("dilation", [[1, 1]]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [2, 6]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0]) +# in/out MMV ("M") 
+@pytest.mark.parametrize("m", [1]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_slidingwindow_rtl_dynamic( + idt, k, ifm_dim_series, ifm_ch, stride, dilation, dw, simd, m, parallel_window +): + # Begin test by generating RTL SWG normally for the first FM of the series. + # The following FM dimensions must be equal or smaller than the initial + # dimensions (in terms of required buffer depth). + ifm_dim = ifm_dim_series[0] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( + k_w == 1 and (stride_w != 1 or dilation_w != 1) + ): + pytest.skip( + """Illegal convolution configuration: + stride or dilation defined for unitary kernel dim""" + ) + if k_h == 1 and k_w == 1 and simd != ifm_ch: + pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") + if parallel_window and simd != ifm_ch: + pytest.skip("Parallel window requires SIMD=C") + + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + m=m, + parallel_window=parallel_window, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + # Simulate using stitched-ip-rtlsim so we can use existing infrastructure + # that supports hook functions to re-program configuration before rtlsim + model = model.transform(InsertFIFO(True)) # required for proper simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5)) + model.set_metadata_prop("exec_mode", "rtlsim") + + # Simulate 1 FM for each dimension in the series + for i, ifm_dim in enumerate(ifm_dim_series): + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + configs = None + if i > 0: # skip re-programming for initial FM dimension + # Necessary update of node and tensor attributes to make rtlsim work: + swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] + swg_inst = getCustomOp(swg_node) + update_tensor_dim(model, swg_node.input[0], ifm_dim) + update_tensor_dim(model, swg_node.output[0], ofm_dim) + + # Generate config, also overwrites IFMDim/OFMDim attributes: + config = swg_inst.get_dynamic_config(ifm_dim) + configs = [("s_axilite_0_", config)] + + # Also update FIFO nodes and corresponding tensors + fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0] + fifo_inst = getCustomOp(fifo_node) + shape = 
fifo_inst.get_nodeattr("folded_shape") + shape[1] = ifm_dim_h + shape[2] = ifm_dim_w + fifo_inst.set_nodeattr("folded_shape", shape) + update_tensor_dim(model, fifo_node.input[0], ifm_dim) + + fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1] + fifo_inst = getCustomOp(fifo_node) + shape = fifo_inst.get_nodeattr("folded_shape") + shape[1] = ofm_dim_h + shape[2] = ofm_dim_w + fifo_inst.set_nodeattr("folded_shape", shape) + update_tensor_dim(model, fifo_node.output[0], ofm_dim) + + # Run rtlsim on stitched-ip + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + context = prepare_inputs(x) + rtlsim_exec(model, context, pre_hook=config_hook(configs)) + y_produced = context["outp"] + + # Generate golden result + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + stride=stride, + dilation=dilation, + idt=idt, + ) + input_dict = prepare_inputs(x) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + # Check result + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py new file mode 100644 index 0000000000..8a3c1fe682 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
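+
+# Test for the DownSampler layer: a strided 1x1 convolution is lowered with
+# LowerConvsToMatMul and converted via InferConvInpGen, then executed in
+# cppsim or rtlsim and compared against the reference ONNX execution of the
+# original model.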
+ +import pytest + +import numpy as np +import onnx.parser as oprs +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): + np.random.seed(0) + out_dim = compute_conv_output_dim(in_dim, k, stride, 2 * pad_half) + ifm = 8 + ofm = 16 + if is_1d: + if flip_1d: + shape_in = [1, ifm, 1, in_dim] + shape_out = [1, ofm, 1, out_dim] + shape_k = [1, k] + shape_s = [1, stride] + shape_p = [0, pad_half, 0, pad_half] + else: + shape_in = [1, ifm, in_dim, 1] + shape_out = [1, ofm, out_dim, 1] + shape_k = [k, 1] + shape_s = [stride, 1] + shape_p = [pad_half, 0, pad_half, 0] + else: + shape_in = [1, ifm, in_dim, in_dim] + shape_out = [1, ofm, out_dim, out_dim] + shape_k = [k, k] + shape_s = [stride, stride] + shape_p = [pad_half, pad_half, pad_half, pad_half] + shape_w = [ofm, ifm] + shape_k + + sstr_in = str(shape_in) + sstr_out = str(shape_out) + sstr_k = str(shape_k) + sstr_s = str(shape_s) + sstr_p = str(shape_p) + sstr_w = str(shape_w) + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{sstr_in} in0) => (float{sstr_out} out0) + < + float{sstr_w} param_w_conv0 + > + {{ + out0 = Conv(in0, param_w_conv0) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt_in) + model.set_tensor_datatype("param_w_conv0", dt_w) + model.set_initializer("param_w_conv0", gen_finn_dt_tensor(dt_w, shape_w)) + model = model.transform(InferShapes()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(InferShapes()) + return model + + +@pytest.mark.parametrize("is_1d", [True, False]) +@pytest.mark.parametrize("flip_1d", [True, False]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): + if flip_1d and not is_1d: + pytest.skip("flip_1d only applicable for is_1d") + in_dim = 32 + k = 1 + stride = 2 + dt_in = DataType["UINT8"] + dt_w = DataType["INT2"] + model = build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d) + inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) + idict = {"in0": inp} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferConvInpGen()) + assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) + model = 
model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("DownSampler")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + # small adjustment for 2D testcase due to how rtlsim works: + # output is finished before all pixels are read, since last + # row is dropped (rtlsim finishes based on # of expected + # pixels) + if not is_1d: + exp_cycles = exp_cycles - in_dim + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 7ec254405d..27bab93fb6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -36,7 +36,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -56,9 +56,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): for i in range(n_dupl): outp_name = "outp%d" % i out_names.append(outp_name) - out_vi.append( - helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape) - ) + out_vi.append(helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape)) dupstrm_node = helper.make_node( "DuplicateStreams_Batch", @@ -72,11 +70,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): inputDataType=idt.name, numInputVectors=[1, idim, idim], ) - graph = helper.make_graph( - nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi - ) + graph = helper.make_graph(nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi) - model = helper.make_model(graph, producer_name="addstreams-model") + model = qonnx_make_model(graph, producer_name="addstreams-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index bcf2a1fe3d..eb6e0651d9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -32,19 +32,18 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip 
import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype): - - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape) +def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) DWC_node = helper.make_node( "StreamingDataWidthConverter_Batch", @@ -52,17 +51,16 @@ def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype): ["outp"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - shape=Shape, - inWidth=INWidth, - outWidth=OUTWidth, + shape=shape, + inWidth=inWidth, + outWidth=outWidth, dataType=str(finn_dtype.name), + impl_style=impl_style, ) - graph = helper.make_graph( - nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="dwc-model") + model = qonnx_make_model(graph, producer_name="dwc-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", finn_dtype) @@ -75,34 +73,40 @@ def prepare_inputs(input_tensor, dt): return {"inp": input_tensor} -# shape -@pytest.mark.parametrize("Shape", [[1, 4], [1, 2, 8]]) -# inWidth -@pytest.mark.parametrize("INWidth", [2, 4]) -# outWidth -@pytest.mark.parametrize("OUTWidth", [2, 4]) -# finn_dtype -@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.parametrize( + "config", + [ + ([1, 24], 6, 4, DataType["INT2"], "hls"), + ([1, 24], 4, 6, DataType["INT2"], "hls"), + ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"), + ([1, 4], 4, 2, DataType["INT2"], "hls"), + ([1, 2, 8], 4, 4, DataType["INT2"], "hls"), + ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"), + ], +) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype): - +def test_fpgadataflow_dwc_rtlsim(config): + shape, inWidth, outWidth, finn_dtype, impl_style = config + test_fpga_part = "xc7z020clg400-1" + target_clk_ns = 10.0 # generate input data - x = gen_finn_dt_tensor(finn_dtype, Shape) + x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype) - - model = model.transform(SetExecMode("rtlsim")) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model.set_metadata_prop("exec_mode", "rtlsim") y = oxe.execute_onnx(model, input_dict)["outp"] assert ( y == x ).all(), """The output values are not the same as the input values anymore.""" - assert y.shape == tuple(Shape), """The output shape is incorrect.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" diff --git 
a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py new file mode 100644 index 0000000000..6028a9b9f0 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx.parser as oprs +import qonnx.core.data_layout as dl +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def build_model(shp, dt0, dt1, do_abs): + np.random.seed(0) + shp_str = str(shp) + if do_abs: + graph = """ + sub_out = Sub(in0, in1) + out0 = Abs(sub_out) + """ + else: + graph = "out0 = Sub(in0, in1)" + + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0) + {{ + {graph} + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model.set_tensor_datatype("in0", dt0) + model.set_tensor_datatype("in1", dt1) + model.set_tensor_layout("in0", dl.NHWC) + model.set_tensor_layout("in1", dl.NHWC) + model = model.transform(InferShapes()) + return model + + +# input datatype for one operand +@pytest.mark.parametrize("dt0", 
[DataType["UINT4"], DataType["UINT7"]]) +# channels +@pytest.mark.parametrize("ch", [1, 64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# include Abs output node or not +@pytest.mark.parametrize("do_abs", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): + if fold == -1: + pe = 1 + else: + pe = max(1, ch // fold) + assert ch % pe == 0 + dt1 = DataType["UINT8"] + shp = [1, 4, 2, ch] + model = build_model(shp, dt0, dt1, do_abs) + in0 = gen_finn_dt_tensor(dt0, shp) + in1 = gen_finn_dt_tensor(dt1, shp) + idict = {"in0": in0, "in1": in1} + y_expected = execute_onnx(model, idict)["out0"] + model = model.transform(to_hls.InferStreamingEltwise()) + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise" + getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " failed" + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("StreamingEltwise")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index b9c74185d9..27417a78e1 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -33,7 +33,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -47,7 +47,6 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape) @@ -62,11 +61,9 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): dataType=str(finn_dtype.name), ) - graph = helper.make_graph( - nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="fifo-model") + model = qonnx_make_model(graph, producer_name="fifo-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", finn_dtype) @@ -91,7 +88,6 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): - # generate input data x = gen_finn_dt_tensor(finn_dtype, Shape) input_dict = 
prepare_inputs(x, finn_dtype) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 2e2da0da7a..c871811c5e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -36,7 +36,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -53,25 +53,20 @@ target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): +def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt): pad_h = padding[0] + padding[2] pad_w = padding[1] + padding[3] idim_h, idim_w = idim - assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim" odim_h = idim_h + pad_h odim_w = idim_w + pad_w - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]) FMPadding = helper.make_node( - "FMPadding_Batch", + optype, ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -80,7 +75,6 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty Padding=padding, NumChannels=num_ch, inputDataType=str(idt.name), - PaddingStyle=pad_style, numInputVectors=1, SIMD=simd, ) @@ -89,7 +83,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="fmpadding-model") + model = qonnx_make_model(graph, producer_name="fmpadding-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -101,21 +95,23 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty # input image dimension @pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) # number of rows and number of cols to add -@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]]) +@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]]) # number of channels @pytest.mark.parametrize("num_ch", [2, 4]) # Input parallelism @pytest.mark.parametrize("simd", [1, 2]) -# PaddingStyle: selects behavior when (odim-idim)%2 != 0 -@pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]]) # execution mode @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): + if impl_style == "rtl" and mode == "cppsim": + pytest.skip("rtl implstyle has no cppsim, skipping") if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, 
skipping") @@ -123,19 +119,15 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - if idim_h == idim_w and pad_h != pad_w: - pytest.skip( - """Only equal padding along the dimensions for square images - is supported, skipping""" - ) - # generate input data x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch]) input_dict = {"inp": x} odim_h = idim_h + pad_h odim_w = idim_w + pad_w - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) + optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style] + + model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -146,36 +138,17 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] expected_oshape = (1, odim_h, odim_w, num_ch) assert y_produced.shape == expected_oshape - # calculate reference - # calculate correct pad according to parameters - if pad_style == 2: - if pad_h % 2 == 0: - pad_up = pad_h // 2 - else: - pad_up = pad_h // 2 + 1 - if pad_w % 2 == 0: - pad_left = pad_w // 2 - else: - pad_left = pad_w // 2 + 1 - else: - pad_up = pad_h // 2 - pad_left = pad_w // 2 - - pad_down = pad_h - pad_up - pad_right = pad_w - pad_left - - y_expected = np.pad( - x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant" - ) + y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") assert (y_produced == y_expected).all() if mode == "rtlsim": - node = model.get_nodes_by_op_type("FMPadding_Batch")[0] + node = model.get_nodes_by_op_type(optype)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index a37e6e3271..1b3d87c11f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -61,11 +61,9 @@ def make_accpool_modelwrapper(ch, pe, idim, idt): inputDataType=idt.name, numInputVectors=[1, idim, idim], ) - graph = helper.make_graph( - nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="thresholding-model") + model = qonnx_make_model(graph, producer_name="thresholding-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 80f2d724ad..2d85cc98f4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -36,7 +36,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.create_dataflow_partition import ( @@ -96,11 +96,9 @@ def create_one_fc_model(mem_mode="const"): mem_mode=mem_mode, ) - graph = helper.make_graph( - nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -177,7 +175,7 @@ def create_two_fc_model(mem_mode="decoupled"): value_info=[mid], ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -206,16 +204,13 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" assert os.path.isfile(sdp_node.get_nodeattr("model")) model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) - model.set_metadata_prop("exec_mode", "remote_pynq") model = model.transform(InsertTLastMarker()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) assert model.graph.node[0].op_type == "MatrixVectorActivation" assert model.graph.node[-1].op_type == "TLastMarker" - model.save( - ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode - ) + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) @@ -348,6 +343,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, period_ns)) + model = model.transform(HLSSynthIP()) model = model.transform(VitisBuild(fpga_part, period_ns, platform)) model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") assert model.get_metadata_prop("platform") == "alveo" diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index a9b98ecaf8..efd093b0b3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -33,7 +33,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -67,7 +67,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt): outputs=[outp], ) - model = helper.make_model(graph, producer_name="thresholding-model") + model = qonnx_make_model(graph, producer_name="thresholding-model") model = ModelWrapper(model) 
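+    # Note on the recurring helper.make_model -> qonnx_make_model swap in this
+    # change: qonnx_make_model wraps onnx.helper.make_model so that the created
+    # ModelProto is pinned to the opset version qonnx expects, rather than the
+    # default of whichever onnx package happens to be installed. A minimal
+    # sketch of the idea (not the actual qonnx source; the pinned version is
+    # illustrative only):
+    #
+    #   from onnx import helper
+    #   def qonnx_make_model(graph, **kwargs):
+    #       kwargs.setdefault("opset_imports", [helper.make_opsetid("", 11)])
+    #       return helper.make_model(graph, **kwargs)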
model.set_tensor_datatype("inp", idt) @@ -81,9 +81,7 @@ def prepare_inputs(input_tensor, idt): return {"inp": input_tensor} -@pytest.mark.parametrize( - "idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]] -) +@pytest.mark.parametrize("idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]]) # labels @pytest.mark.parametrize("labels", [10, 100]) # folding diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index da4204c81a..7951007045 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -57,9 +57,7 @@ def make_lookup_model(embeddings, ishape, idt, edt): class LookupModel(nn.Module): def __init__(self, num_embeddings, embedding_dim): super().__init__() - self.lookup = nn.Embedding( - num_embeddings=num_embeddings, embedding_dim=embedding_dim - ) + self.lookup = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim) def forward(self, x): x = self.lookup(x) @@ -122,6 +120,7 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname if exec_mode == "cppsim": + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index d1895a1267..b80ef76a19 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -36,12 +36,17 @@ from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from qonnx.util.basic import ( + calculate_signed_dot_prod_range, + gen_finn_dt_tensor, + qonnx_make_model, +) import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP @@ -105,7 +110,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -417,3 +422,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 + + +# mem_mode: const or decoupled +@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +# neuron folding, -1 is maximum possible 
+@pytest.mark.parametrize("nf", [8]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [8]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [32]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [32]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + for node in model.graph.node: + # lookup op_type in registry of CustomOps + inst = getCustomOp(node) + inst.set_nodeattr("mem_mode", mem_mode) + total_fold = nf * sf + exp_total_cycles = total_fold + 10 + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + model = model.transform(DeriveCharacteristic(exp_total_cycles)) + node_inst = getCustomOp(model.graph.node[0]) + period_attr = node_inst.get_nodeattr("io_chrc_period") + assert period_attr == exp_total_cycles + chrc_in = node_inst.get_nodeattr("io_chrc_in") + chrc_out = node_inst.get_nodeattr("io_chrc_out") + assert chrc_in.shape == (1, 2 * exp_total_cycles) + assert chrc_out.shape == (1, 2 * exp_total_cycles) + # first sf cycles should read input continuously + assert (chrc_in[0, :sf] == range(1, sf + 1)).all() + # all outputs should be produced within the exp n of cycles + assert chrc_out[0, exp_total_cycles] == nf diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index e3c79fa44f..2ff7dd8b32 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -32,6 +32,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import qonnx_make_model from finn.analysis.fpgadataflow.res_estimation import ( res_estimation, @@ -87,7 +88,7 @@ def test_res_estimate(): nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp] ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -100,7 +101,7 @@ def test_res_estimate(): "MatrixVectorActivation_0": { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 357, + "LUT": 317, "DSP": 0, "URAM_efficiency": 1, "URAM": 0, @@ -118,7 +119,7 @@ def test_res_estimate(): { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 352, + "LUT": 313, "DSP": 1, "URAM": 0, "URAM_efficiency": 1, @@ -126,7 +127,7 @@ def test_res_estimate(): { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 357, + "LUT": 317, "DSP": 0, "URAM": 0, "URAM_efficiency": 1, diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index a3968cf797..67a40d96f3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -35,7 +35,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -53,9 +53,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_ ifm_dim_h, ifm_dim_w = ifm_dim ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] ) @@ -70,11 +68,9 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_ ceil_mode=ceil_mode, pads=[0, 0, 0, 0], ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="mp-model") + model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -106,9 +102,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_streamingmaxpool( - idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode -): +def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode): ifm_dim_h = ifm_dim k_h = k if dim_1d: @@ -138,9 +132,7 @@ def test_fpgadataflow_streamingmaxpool( # prepare input data input_dict = prepare_inputs(x) - golden = make_single_maxpoolnhwc_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode - ) + golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] model = golden.transform(InferStreamingMaxPool()) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 706679b680..2b7bc28a10 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -37,7 +37,7 @@ from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -56,17 +56,11 @@ target_clk_ns = 5 -def make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs -): +def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs): NumChannels = T.shape[0] - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, 
n_inp_vecs + [NumChannels]) node_inp_list = ["inp", "thresh"] @@ -93,7 +87,7 @@ def make_single_thresholding_modelwrapper( outputs=[outp], ) - model = helper.make_model(graph, producer_name="thresholding-model") + model = qonnx_make_model(graph, producer_name="thresholding-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -132,10 +126,6 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): odt = act n_steps = act.get_num_possible_values() - 1 T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) - # make the vivado_hls threshold bug appear (incorrect rtlsim result when first - # threshold of first channel is zero, while using BIPOLAR output) - if act == DataType["BIPOLAR"]: - T[0][0] = 0 # provide non-decreasing thresholds T = np.sort(T, axis=1) @@ -144,9 +134,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: actval = odt.min() - model = make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs - ) + model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -223,9 +211,7 @@ def test_runtime_thresholds_single_layer(): else: actval = odt.min() - model = make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs - ) + model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) op_inst = getCustomOp(model.graph.node[0]) op_inst.set_nodeattr("runtime_writeable_weights", 1) op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") @@ -252,9 +238,7 @@ def test_runtime_thresholds_single_layer(): def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append( - axilite_read(sim, addr, basename="s_axilite_0_") - ) + extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) @@ -277,9 +261,7 @@ def read_weights(sim): expected += act.min() assert (y == expected).all() - new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype( - np.float32 - ) + new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) # provide non-decreasing thresholds new_weights = np.sort(T, axis=1) op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat") diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index d1ef0b890a..a08d31f7b0 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -30,6 +30,7 @@ import numpy as np import os +import shutil import torch from brevitas.export import FINNManager from qonnx.core.datatype import DataType @@ -51,6 +52,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.util.basic import make_build_dir tmpdir = os.environ["FINN_BUILD_DIR"] @@ -117,7 +119,7 @@ def forward(self, x): # param datatype @pytest.mark.parametrize("dt", [DataType["INT8"]]) -# Width/height of square input feature map +# spatial dim input feature map @pytest.mark.parametrize("IFMDim", [3, 5]) # upscaling factor @pytest.mark.parametrize("scale", [2, 3]) @@ -125,14 +127,22 @@ def forward(self, x): 
@pytest.mark.parametrize("NumChannels", [4]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# whether to use 1D or 2D square testcases +@pytest.mark.parametrize("is_1d", [False, True]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode): +def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d): + tmpdir = make_build_dir("upsample_export_") atol = 1e-3 + if is_1d: + input_shape = (1, NumChannels, IFMDim, 1) + upscale_factor = (scale, 1) + else: + input_shape = (1, NumChannels, IFMDim, IFMDim) + upscale_factor = (scale, scale) # Create the test model and inputs for it - torch_model = PyTorchTestModel(upscale_factor=scale) - input_shape = (1, NumChannels, IFMDim, IFMDim) + torch_model = PyTorchTestModel(upscale_factor=upscale_factor) test_in = torch.arange(0, np.prod(np.asarray(input_shape))) # Limit the input to values valid for the given datatype test_in %= dt.max() - dt.min() + 1 @@ -200,3 +210,4 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode): assert output_matches, "Cppsim output doesn't match ONNX/PyTorch." elif exec_mode == "rtlsim": assert output_matches, "Rtlsim output doesn't match ONNX/PyTorch." + shutil.rmtree(tmpdir, ignore_errors=True) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index c48448787d..4208169c0b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -35,7 +35,9 @@ from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -75,7 +77,20 @@ def _calculate_dot_prod_range(dt_a, dt_b, len): def _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None + W, + pe, + simd, + k_h, + k_w, + channels, + dim_h, + dim_w, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -91,7 +106,10 @@ def _make_single_vvau_modelwrapper( if T is not None: no_act = 0 node_inp_list = ["inp", "weights", "thresh"] - actval = odt.min() + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() else: no_act = 1 node_inp_list = ["inp", "weights"] @@ -104,6 +122,7 @@ def _make_single_vvau_modelwrapper( domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", PE=pe, + SIMD=simd, Dim=[dim_h, dim_w], Channels=channels, Kernel=[k_h, k_w], @@ -113,13 +132,12 @@ def _make_single_vvau_modelwrapper( weightDataType=wdt.name, outputDataType=odt.name, noActivation=no_act, + mem_mode=mem_mode, ) - graph = helper.make_graph( - nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]) - model = helper.make_model(graph, producer_name="vvau-model") + model = qonnx_make_model(graph, producer_name="vvau-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) @@ -133,6 
+151,9 @@ def _make_single_vvau_modelwrapper(
         model.set_tensor_datatype("thresh", tdt)
         model.set_initializer("thresh", T)

+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model

@@ -140,14 +161,16 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}

-# mem_mode: const or decoupled
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType["UINT4"], None])
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
 # PE
-@pytest.mark.parametrize("pe", [1, "channels"])
+@pytest.mark.parametrize("pe", [1, 3, 6])
+# SIMD
+@pytest.mark.parametrize("simd", [1, 9])
 # Input image shape
 @pytest.mark.parametrize("dim_h", [10])
 @pytest.mark.parametrize("dim_w", [10, 1])
@@ -155,29 +178,29 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("k_h", [3])
 @pytest.mark.parametrize("k_w", [3, 1])
 # Number of input and output channels
-@pytest.mark.parametrize("channels", [3, 4])
+@pytest.mark.parametrize("channels", [3, 6])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_vvau(
-    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode
+    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
 ):
-    if pe == "channels":
-        pe = channels
-
     if dim_w == 1 and k_w != 1:
         pytest.skip("1D image requires 1D kernel, skipping.")

     if channels % pe != 0:
         pytest.skip("Requirement Channels divisible by PE is violated.")

+    if (k_h * k_w) % simd != 0:
+        pytest.skip("Requirement kernel (k_h * k_w) divisible by SIMD is violated.")
+
     # Generate weights in expected shape for ONNX and HLS node
     W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
-    W_onnx = _infer_sparse_weight_tensor(
-        W, k_h, k_w, channels
-    )  # shape: [k*k*channels, channels]
+    W_onnx = _infer_sparse_weight_tensor(W, k_h, k_w, channels)  # shape: [k*k*channels, channels]

     # Generate inputs in expected format for ONNX and HLS node
     x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels))
@@ -188,17 +211,26 @@ def test_fpgadataflow_vvau(
     if act is None:
         T = None
         tdt = None
-        odt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
     else:
         odt = act
-        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
         n_steps = act.get_num_possible_values() - 1
         T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
         T = np.sort(T, axis=1)
-        tdt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + (k_h * k_w)) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]

     model = _make_single_vvau_modelwrapper(
-        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
+        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
     )

     if exec_mode == "cppsim":
@@ -217,20 +249,29 @@ def
test_fpgadataflow_vvau( input_dict = prepare_inputs(x_vvau) # Calculate output - y_expected = np.matmul(x, W_onnx) # Y is in [N, H, W, C] format + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # Simulate XNOR-popcount matrix multiplication, see + # qonnx.custom_op.general.xnorpopcount (not usable due to sparse W) + y_expected = np.matmul(x, W_onnx) + y_expected = (y_expected + (k_h * k_w)) / 2 + else: + y_expected = np.matmul(x, W_onnx) # Y is in [N, H, W, C] format + if T is not None: # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format y_expected = np.transpose(y_expected, (0, 3, 1, 2)) y_expected = multithreshold(y_expected, T) y_expected = np.transpose(y_expected, (0, 2, 3, 1)) - # signed offset - y_expected += act.min() + if act == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + else: + # signed offset + y_expected += act.min() - y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[ - "outp" - ] + y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["outp"] - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), "incorrect result" if exec_mode == "rtlsim": node = model.get_nodes_by_op_type("VectorVectorActivation")[0] diff --git a/tests/fpgadataflow/test_minimize_bit_width.py b/tests/fpgadataflow/test_minimize_bit_width.py new file mode 100644 index 0000000000..0e704230e7 --- /dev/null +++ b/tests/fpgadataflow/test_minimize_bit_width.py @@ -0,0 +1,308 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
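+
+# The closed-form accumulator bound exercised by this test (from the paper
+# cited in calculate_accumulator_bit_width below) is easy to sanity-check by
+# hand: with phi(x) = log2(1 + 2^-x) and runtime-writeable weights, the bound
+# is P = ceil(alpha + phi(alpha) + 1), where
+# alpha = log2(K) + bits(input) + bits(weight) - 1 - signed(input).
+# Worked example (illustrative numbers, not tied to the parametrization in
+# this file): a 3x3 kernel (K = 9) with INT8 inputs and INT8 weights gives
+# alpha = log2(9) + 8 + 8 - 1 - 1 ~= 17.17 and phi(alpha) ~= 0, so
+# P = ceil(17.17 + 0 + 1) = 19, i.e. an INT19 accumulator.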
+ +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import BipolarType, DataType, IntType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple +from typing import Optional, Union + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) + + +def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] = None): + """Creates a toy finn-onnx model for unit testing. The VVAU-MVAU pair is based + on the first pair of MobileNetV1""" + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 32, 32, 288]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 32, 32, 64]) + layer1 = helper.make_node( + "VectorVectorActivation", + ["inp", "params0", "thresh0"] if tdt is not None else ["inp", "params0"], + ["hid"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=1, + Channels=32, + Dim=(32, 32), + Kernel=(3, 3), + inputDataType=idt.name, + outputDataType=idt.name, + weightDataType=wdt.name, + ActVal=tdt.min() if tdt is not None else 0, + noActivation=0 if tdt is not None else 1, + ) + layer2 = helper.make_node( + "MatrixVectorActivation", + ["hid", "params1", "thresh1"] if tdt is not None else ["hid", "params1"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=32, # matrix_width (num_inputs) + MH=64, # matrix_height (num_outputs) + SIMD=1, + PE=1, + inputDataType=idt.name, + outputDataType=idt.name, + weightDataType=wdt.name, + ActVal=tdt.min() if tdt is not None else 0, + noActivation=0 if tdt is not None else 1, + binaryXnorMode=0, + ) + graph = helper.make_graph( + nodes=[layer1, layer2], name="fclayer_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_datatype("hid", idt) + model.set_tensor_datatype("params0", wdt) + model.set_tensor_datatype("params1", wdt) + model.set_initializer("params0", gen_finn_dt_tensor(wdt, (32, 1, 3, 3))) + model.set_initializer("params1", gen_finn_dt_tensor(wdt, (32, 64))) + # if the threshold data type is specified, then we need to generate + # some dummy threshold values + if tdt is not None: + model.set_tensor_datatype("thresh0", tdt) + model.set_tensor_datatype("thresh1", tdt) + # Create threshold tensors + n_steps: int = idt.get_num_possible_values() - 1 + thresholds: Optional[np.ndarray] = np.random.randint( + tdt.min(), tdt.max() - 1, (32, n_steps) + ).astype( + np.float32 + ) # generate thresholds for the activations + thresholds = np.sort(thresholds, axis=1) # provide non-decreasing thresholds + model.set_initializer("thresh0", thresholds) + thresholds: Optional[np.ndarray] = np.random.randint( + tdt.min(), tdt.max() - 1, (64, n_steps) + ).astype( + np.float32 + ) # generate thresholds for the activations + thresholds = np.sort(thresholds, axis=1) # provide non-decreasing thresholds + model.set_initializer("thresh1", thresholds) + return model + + +weight_data_types = [ + 
DataType["INT8"], + DataType["UINT8"], + DataType["INT7"], + DataType["UINT7"], + DataType["INT3"], + DataType["UINT3"], + # DataType["BIPOLAR"], # TODO - add support for bipolar weights + DataType["TERNARY"], +] + + +input_data_types = [ + DataType["INT8"], + DataType["UINT8"], + DataType["INT3"], + DataType["UINT3"], + DataType["BIPOLAR"], + DataType["TERNARY"], +] + + +@pytest.mark.parametrize("wdt", weight_data_types) +@pytest.mark.parametrize("rww", [True, False]) +@pytest.mark.fpgadataflow +def test_minimize_weight_bit_width(wdt: DataType, rww: bool): + """Testing MinimizeWeightBitWidth for VVAU and MVAU. + + :param wdt: (DataType) The data type that we are testing for the weights + :param rww: (bool) Whether or not to use runtime-writeable weights""" + if isinstance(wdt, BipolarType): + # current MinimizeWeightBitWidth sets {-1,1} to INT2, need to check + # for 0 in weights to minimize weight bit width to bipolar + pytest.skip("Not well-supported for this optimization") + + # Create a w8a8 model + def_wdt = DataType["UINT8"] + model = make_unit_test_model(def_wdt, DataType["INT8"]) + + # Create new weights for the model based on wdt + params0 = gen_finn_dt_tensor(wdt, (32, 1, 3, 3)) + params1 = gen_finn_dt_tensor(wdt, (32, 64)) + model.set_initializer("params0", params0) + model.set_initializer("params1", params1) + + # If runtime-writeable weights, specify as a node attribute + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + inst.set_nodeattr("runtime_writeable_weights", int(rww)) + + # Apply the optimization + model = model.transform(MinimizeWeightBitWidth()) + + # Iterate through each node to make sure it functioned properly + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + cur_wdt = DataType[inst.get_nodeattr("weightDataType")] + exp_wdt = def_wdt if rww else wdt + assert cur_wdt.bitwidth() == exp_wdt.bitwidth(), "Mismatched data types" + + +def calculate_accumulator_bit_width( + inst: Union[MatrixVectorActivation, VectorVectorActivation], model: ModelWrapper +) -> Union[DataType, IntType]: + """Calculate the accumulator bit width using the closed-form expressions + derived in `Quantized Neural Networks for Low-Precision Accumulation + with Guaranteed Overflow Avoidance` (2023) by I.Colbert, A. Pappalardo, + J. 
Petri-Koenig
+
+    :param inst: (HLSCustomOp) The instance of the MVAU or VVAU
+    :param model: (ModelWrapper) The instance of the whole model
+    """
+
+    def phi(x: float) -> float:
+        return np.log2(1 + pow(2, -x))
+
+    weights = model.get_initializer(inst.onnx_node.input[1])
+    # since the weight values themselves enter the calculation, binary {0, 1}
+    # weights need to be converted to bipolar {-1, +1} first
+    if inst.get_nodeattr("binaryXnorMode"):
+        weights = 2 * weights - 1
+    # modify the weights based on if the node is a VVAU or MVAU
+    if isinstance(inst, MatrixVectorActivation):
+        K = inst.get_nodeattr("MW")  # matrix_width = num_inputs
+    elif isinstance(inst, VectorVectorActivation):
+        k_h, k_w = inst.get_nodeattr("Kernel")
+        K = k_h * k_w  # size of kernels = num_inputs
+        fm = inst.get_nodeattr("Channels")
+        # put weights into the shape expected by calculate_matvec_accumulator_range
+        weights = weights.reshape(fm, k_h * k_w).transpose()
+    else:
+        raise Exception("Considering only MVAU and VVAU currently")
+    # collect attributes used to determine the accumulator bit width bound
+    wdt = inst.get_weight_datatype()
+    idt = inst.get_input_datatype()
+    rww = inst.get_nodeattr("runtime_writeable_weights")
+    # if runtime-writeable weights, then use the lower bound on the accumulator bit
+    # width as determined by the input and weight data types and size of dot product
+    if rww:
+        alpha = np.log2(K) + idt.bitwidth() + wdt.bitwidth() - 1.0 - float(idt.signed())
+        P = np.ceil(alpha + phi(alpha) + 1.0)
+    # if not runtime-writeable weights, then use the tighter bound on the accumulator
+    # bit width as determined by the weight values themselves
+    else:
+        beta = np.log2(abs(weights).sum(axis=0).max()) + idt.bitwidth() - float(idt.signed())
+        P = np.ceil(beta + phi(beta) + 1.0)
+    # if the node is the last in the graph, then round up to the nearest 8 bits
+    if model.find_direct_successors(inst.onnx_node) is None:
+        P = roundup_to_integer_multiple(P, 8)
+    return DataType[f"INT{int(P)}"]
+
+
+thresh_data_types = [
+    None,
+    DataType["INT32"],
+    DataType["INT24"],
+    DataType["INT16"],
+]
+
+# Removing unsigned data types from weights
+weight_data_types = [
+    DataType["INT8"],
+    DataType["INT7"],
+    DataType["INT3"],
+    # DataType["BIPOLAR"], # TODO - add support for bipolar weights
+    DataType["TERNARY"],
+]
+
+
+@pytest.mark.parametrize("wdt", weight_data_types)
+@pytest.mark.parametrize("idt", input_data_types)
+@pytest.mark.parametrize("tdt", thresh_data_types)
+@pytest.mark.parametrize("rww", [True, False])
+@pytest.mark.fpgadataflow
+def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, rww: bool):
+    """Testing MinimizeAccumulatorWidth for VVAU and MVAU.
+ + :param wdt: (DataType) The data type that we are testing for the weights + :param idt: (DataType) The data type that we are testing for the activations + :param tdt: (DataType) The data type that we are testing for the thresholds + :param rww: (bool) Whether or not to use runtime-writeable weights""" + if (not wdt.signed()) or isinstance(wdt, BipolarType): + pytest.skip("Closed-form accumulator calculation is designed to consider signed weights") + + # Create uniform-precision model + model = make_unit_test_model(wdt, idt, tdt) + def_adt = DataType["INT32"] + + # If runtime-writeable weights, specify as a node attribute + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + inst.set_nodeattr("runtime_writeable_weights", int(rww)) + cur_adt = DataType[inst.get_nodeattr("accDataType")] + assert cur_adt.bitwidth() == def_adt.bitwidth(), "Default data type is incorrect" + + # Apply the optimization + model = model.transform(MinimizeAccumulatorWidth()) + + # Iterate through each node to make sure it functioned properly + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + cur_adt = DataType[inst.get_nodeattr("accDataType")] + cur_odt = DataType[inst.get_nodeattr("outputDataType")] + # Calculating expected accumulator bit width using a closed-form expression + # that is a slight over-approximation of the lower bound. The accumulator + # bit width minimization logic in the MVAU and VVAU is exact and should be + # less than or equal to this calculation + exp_adt = calculate_accumulator_bit_width(inst, model) + assert cur_adt.bitwidth() <= exp_adt.bitwidth(), "Mismatched accumulation data types" + + # if there is no activation, outputDataType = accDataType and if it is the last node + # it needs to be divisible by 8 + if inst.get_nodeattr("noActivation"): + assert ( + cur_adt.bitwidth() == cur_odt.bitwidth() + ), "outputDataType and accDataType should be equal" + if model.find_direct_successors(inst.onnx_node) is None: + assert ( + cur_adt.bitwidth() % 8 + ) == 0, "bit width of last node needs to be divisible by 8" diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 16fed5c3cb..9b2f418776 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -96,9 +96,7 @@ def test_runtime_weights_single_layer(): def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append( - axilite_read(sim, addr, basename="s_axilite_0_") - ) + extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index 8ea0e18f2c..ce9f4b12ed 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -34,6 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import qonnx_make_model from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.create_dataflow_partition import ( @@ -44,7 +45,6 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): - W = np.random.randint(wdt.min(), wdt.max() + 1, size=(ch, ch)) W = 
W.astype(np.float32) @@ -54,9 +54,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): tensors = [] tensors.append(helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ch])) for i in range(1, nnodes): - inter = helper.make_tensor_value_info( - "inter_" + str(i), TensorProto.FLOAT, [1, ch] - ) + inter = helper.make_tensor_value_info("inter_" + str(i), TensorProto.FLOAT, [1, ch]) tensors.append(inter) tensors.append(helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch])) @@ -91,7 +89,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): outputs=[tensors[-1]], ) - model = helper.make_model(graph, producer_name="fclayer-model") + model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("inp", adt) @@ -114,10 +112,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): @pytest.mark.parametrize("platform", ["Pynq-Z1", "Ultra96", "U200"]) @pytest.mark.fpgadataflow def test_set_folding(target_fps, platform): - - model = make_multi_fclayer_model( - 128, DataType["INT4"], DataType["INT2"], DataType["INT16"], 5 - ) + model = make_multi_fclayer_model(128, DataType["INT4"], DataType["INT2"], DataType["INT16"], 5) model = model.transform(GiveUniqueNodeNames()) parent_model = model.transform(CreateDataflowPartition()) diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py new file mode 100644 index 0000000000..3061696a68 --- /dev/null +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -0,0 +1,125 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
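+
+# get_fifo_split_configs(depth, max_qsrl_depth, max_vivado_depth) splits one
+# deep FIFO into a cascade of smaller ones: power-of-two "vivado" IP FIFOs for
+# the bulk of the depth, and a remainder at or below max_qsrl_depth as a
+# shallow shift-register based "rtl" FIFO (a rough description; see
+# test_split_large_fifo_configs below for ground truth). For example, a
+# requested depth of 1200 with limits (256, 32768) becomes
+# [(1024, "vivado"), (176, "rtl")].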
+
+
+import pytest
+
+import json
+import shutil
+import torch
+from brevitas.export import export_qonnx
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    export_qonnx(model, torch.randn(ishape), chkpt_name)
+    return tmp_output_dir
+
+
+def get_folding_cfg(depth=65536):
+    cfg = dict()
+    cfg["Defaults"] = dict()
+    for i in range(3):
+        key = "StreamingFIFO_" + str(i)
+        cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
+    return cfg
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize("depth", [16384, 65536, 45000])
+@pytest.mark.parametrize("force_python_rtlsim", [True, False])
+def test_split_large_fifos(depth, force_python_rtlsim):
+    tmp_output_dir = fetch_test_model("tfc")
+    folding_cfg = get_folding_cfg(depth)
+    with open(tmp_output_dir + "/folding_config.json", "w") as f:
+        json.dump(folding_cfg, f, indent=2)
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=False,
+        split_large_fifos=True,
+        folding_config_file=tmp_output_dir + "/folding_config.json",
+        target_fps=10000,
+        force_python_rtlsim=force_python_rtlsim,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"]) / float(est_data["estimated_throughput_fps"]) > 0.9
+    )
+    model = ModelWrapper(tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx")
+    # exclude final FIFO node (output FIFO, not part of test)
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1]
+    golden_cfg = get_fifo_split_configs(depth, 256, 32768)
+    for i, fifo_node in enumerate(fifo_nodes):
+        inst = getCustomOp(fifo_node)
+        fifo_depth = inst.get_nodeattr("depth")
+        assert fifo_depth == golden_cfg[i % len(golden_cfg)][0]
+
+    shutil.rmtree(tmp_output_dir)
+
+
+def test_split_large_fifo_configs():
+    ret0 = get_fifo_split_configs(513, 256, 32768)
+    assert ret0 == [(512, "vivado"), (1, "rtl")]
+    ret1 = get_fifo_split_configs(1200, 256, 32768)
+    assert ret1 == [(1024, "vivado"), (176, "rtl")]
+    ret2 = get_fifo_split_configs(45000, 256, 32768)
+    assert ret2 == [
+        (32768, "vivado"),
+        (8192, "vivado"),
+        (2048, "vivado"),
+        (1024, "vivado"),
+        (512, "vivado"),
+        (256, "rtl"),
+        (200, "rtl"),
+    ]
diff --git a/tests/notebooks/test_jupyter_notebooks.py b/tests/notebooks/test_jupyter_notebooks.py
new file mode 100644
index 0000000000..e1415b9066
--- /dev/null
+++ b/tests/notebooks/test_jupyter_notebooks.py @@ -0,0 +1,51 @@ +import pytest + +import nbformat +from nbconvert.preprocessors import ExecutePreprocessor + +from finn.util.basic import get_finn_root + +notebook_timeout_seconds = 3600 +notebook_basic_dir = get_finn_root() + "/notebooks/basics/" +notebook_advanced_dir = get_finn_root() + "/notebooks/advanced/" +notebook_cyber_dir = get_finn_root() + "/notebooks/end2end_example/cybersecurity/" +notebook_bnn_dir = get_finn_root() + "/notebooks/end2end_example/bnn-pynq/" + +basics_notebooks = [ + pytest.param(notebook_basic_dir + "0_how_to_work_with_onnx.ipynb"), + pytest.param(notebook_basic_dir + "1_brevitas_network_import_via_QONNX.ipynb"), +] + +advanced_notebooks = [ + pytest.param(notebook_advanced_dir + "0_custom_analysis_pass.ipynb"), + pytest.param(notebook_advanced_dir + "1_custom_transformation_pass.ipynb"), + pytest.param(notebook_advanced_dir + "2_custom_op.ipynb"), + pytest.param(notebook_advanced_dir + "3_folding.ipynb"), + pytest.param(notebook_advanced_dir + "4_advanced_builder_settings.ipynb"), +] + +cyber_notebooks = [ + pytest.param(notebook_cyber_dir + "1-train-mlp-with-brevitas.ipynb"), + pytest.param(notebook_cyber_dir + "2-import-into-finn-and-verify.ipynb"), + pytest.param(notebook_cyber_dir + "3-build-accelerator-with-finn.ipynb"), +] + +bnn_notebooks = [ + pytest.param(notebook_bnn_dir + "cnv_end2end_example.ipynb"), + pytest.param(notebook_bnn_dir + "tfc_end2end_example.ipynb"), + pytest.param(notebook_bnn_dir + "tfc_end2end_verification.ipynb"), +] + + +@pytest.mark.notebooks +@pytest.mark.parametrize( + "notebook", basics_notebooks + advanced_notebooks + cyber_notebooks + bnn_notebooks +) +def test_notebook_exec(notebook): + with open(notebook) as f: + nb = nbformat.read(f, as_version=4) + ep = ExecutePreprocessor(timeout=notebook_timeout_seconds, kernel_name="python3") + try: + assert ep.preprocess(nb) is not None, f"Got empty notebook for {notebook}" + except Exception: + assert False, f"Failed executing {notebook}" diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py index a6dff788dc..1ca8fb06e9 100644 --- a/tests/transformation/streamline/test_absorb_mul_into_topk.py +++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py @@ -34,6 +34,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK @@ -65,23 +66,17 @@ def test_absorb_mul_into_topk(mul_positive, scalar): value_info=[a0, b0, c0], ) - model = helper.make_model(mul_graph, producer_name="mul_model") + model = qonnx_make_model(mul_graph, producer_name="mul_model") model = ModelWrapper(model) # initialize values # for mul if mul_positive is True: - a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(np.float32) else: - a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(np.float32) model.set_initializer("a0", a0_values) # for add - c0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( - np.float32 - ) + c0_values = 
np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(np.float32) model.set_initializer("c0", c0_values) model = model.transform(InsertTopK()) model = model.transform(InferShapes()) @@ -91,9 +86,7 @@ def test_absorb_mul_into_topk(mul_positive, scalar): model_transformed = model.transform(AbsorbScalarMulAddIntoTopK()) # compare execution results - inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype( - np.float32 - ) + inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(np.float32) idict = {"global_in": inp_values} odict = oxe.execute_onnx(model, idict, True) y_indices = odict["global_out"] diff --git a/tests/transformation/streamline/test_absorb_opposite_transposes.py b/tests/transformation/streamline/test_absorb_opposite_transposes.py index 51ea5edfc4..6d8d2b9f0c 100644 --- a/tests/transformation/streamline/test_absorb_opposite_transposes.py +++ b/tests/transformation/streamline/test_absorb_opposite_transposes.py @@ -29,8 +29,7 @@ import pytest import numpy as np -import onnx.helper as oh -from onnx import TensorProto +import onnx.parser as oprs from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes @@ -41,39 +40,42 @@ @pytest.mark.streamline def test_absorb_opposite_transposes(): np.random.seed(0) - input_shape = [1, 3, 4, 2] - top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) - top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) - value_info = [oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [1])] - value_info += [oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [1])] - value_info += [oh.make_tensor_value_info("mul_param_0", TensorProto.FLOAT, [1])] - modelproto = oh.make_model( - oh.make_graph( - name="test", - inputs=[top_in], - outputs=[top_out], - value_info=value_info, - nodes=[ - oh.make_node("Add", ["top_in", "add_param_0"], ["t0"]), - oh.make_node("Transpose", ["t0"], ["t1"], perm=[0, 2, 3, 1]), - oh.make_node("Transpose", ["t1"], ["t2"], perm=[0, 3, 1, 2]), - oh.make_node("Add", ["t2", "add_param_1"], ["t3"]), - oh.make_node("Transpose", ["t3"], ["t4"], perm=[0, 2, 3, 1]), - oh.make_node("Transpose", ["t4"], ["t5"], perm=[0, 3, 1, 2]), - oh.make_node("Add", ["t5", "t2"], ["t6"]), - oh.make_node("Mul", ["t6", "mul_param_0"], ["top_out"]), - ], - ) - ) - model = ModelWrapper(modelproto) + shp = [1, 3, 4, 2] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + < + float[1] add0_param = {{1.0}}, + float[1] add1_param = {{3.0}}, + float[1] mul0_param = {{2.0}} + > + {{ + add0_out = Add(in0, add0_param) + t0_out = Transpose(add0_out) + t1_out = Transpose(t0_out) + add1_out = Add(t1_out, add1_param) + t2_out = Transpose(add1_out) + t3_out = Transpose(t2_out) + add2_out = Add(t1_out, t3_out) + t4_out = Transpose(add2_out) + t5_out = Transpose(t4_out) + t6_out = Transpose(t4_out) + m0_out = Mul(t5_out, mul0_param) + m1_out = Mul(t6_out, mul0_param) + out0 = Mul(m0_out, m1_out) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) model = model.transform(InferShapes()) - model.set_initializer("add_param_0", np.asarray([1], dtype=np.float32)) - model.set_initializer("add_param_1", np.asarray([3], dtype=np.float32)) - model.set_initializer("mul_param_0", np.asarray([2], dtype=np.float32)) new_model = model.transform(AbsorbConsecutiveTransposes()) new_model = new_model.transform(InferShapes()) - 
inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    inp_dict = {"in0": np.random.rand(*shp).astype(np.float32)}
-    assert ox.compare_execution(model, model, inp_dict)
+    assert ox.compare_execution(model, new_model, inp_dict)
-    assert len(new_model.graph.node) == 4
+    assert len(new_model.graph.node) == 6
     for n in new_model.graph.node:
-        assert new_model.graph.node[0].op_type != "Transpose"
+        assert n.op_type != "Transpose"
diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
index 1358d468c0..5b278bd552 100644
--- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py
+++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
@@ -8,6 +8,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
@@ -45,7 +46,7 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="absorb_transpose_model")
+    model = qonnx_make_model(graph, producer_name="absorb_transpose_model")
     model = ModelWrapper(model)
     if shape is not None:
         model.graph.value_info.append(shape0)
@@ -64,9 +65,7 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
     # model_transformed.save("test2.onnx")
 
     # verify transformation
-    inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype(
-        np.float32
-    )
+    inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype(np.float32)
     idict = {model.graph.input[0].name: inp_values}
 
     assert oxe.compare_execution(model, model_transformed, idict)
diff --git a/tests/transformation/streamline/test_collapse_repeated_op.py b/tests/transformation/streamline/test_collapse_repeated_op.py
index 268e0ffc5c..c1d3ee0088 100644
--- a/tests/transformation/streamline/test_collapse_repeated_op.py
+++ b/tests/transformation/streamline/test_collapse_repeated_op.py
@@ -33,6 +33,7 @@ from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul
@@ -46,7 +47,7 @@ def test_collapse_repeated_op():
     add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2])
     mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -96,7 +97,7 @@ def test_collapse_repeated_only_if_linear(test_args):
     value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, [1])]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
index 04ab9bf0b9..89596a1c0f 100644
--- a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
+++ b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
@@ -33,6 +33,7 @@ from onnx import
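The parser-built Transpose pairs in the test above use perms [0, 2, 3, 1] and [0, 3, 1, 2]; AbsorbConsecutiveTransposes may remove such a pair exactly because the second permutation inverts the first. A standalone numpy sketch of that identity (array names are illustrative):

    import numpy as np

    x = np.random.rand(1, 3, 4, 2).astype(np.float32)
    perm_a = [0, 2, 3, 1]  # NCHW -> NHWC
    perm_b = [0, 3, 1, 2]  # NHWC -> NCHW, the inverse permutation
    # composing the two permutations yields the identity, so the pair cancels
    assert [perm_a[i] for i in perm_b] == [0, 1, 2, 3]
    assert np.array_equal(x.transpose(perm_a).transpose(perm_b), x)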
TensorProto from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as ox from finn.transformation.streamline import FactorOutMulSignMagnitude @@ -43,7 +44,7 @@ def test_factor_out_mul_sign_magnitude(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2]) mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2]) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py index 12633d750b..70fc395652 100644 --- a/tests/transformation/streamline/test_linear_past_eltwise.py +++ b/tests/transformation/streamline/test_linear_past_eltwise.py @@ -35,6 +35,7 @@ from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd @@ -62,15 +63,9 @@ def make_model(shape): add1_node = helper.make_node("Add", [inp1.name, inp1_add_ct.name], [inp1_add.name]) add2_node = helper.make_node("Add", [inp2.name, inp2_add_ct.name], [inp2_add.name]) - mul1_node = helper.make_node( - "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name] - ) - mul2_node = helper.make_node( - "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name] - ) - eltwise_add_node = helper.make_node( - "Add", [inp1_mul.name, inp2_mul.name], [outp.name] - ) + mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) + mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) + eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [outp.name]) graph = helper.make_graph( nodes=[add1_node, add2_node, mul1_node, mul2_node, eltwise_add_node], name="graph", @@ -78,7 +73,7 @@ def make_model(shape): outputs=[outp], ) - model = helper.make_model(graph, producer_name="add-model") + model = qonnx_make_model(graph, producer_name="add-model") model = ModelWrapper(model) # set initializers for scalar add/mul nodes @@ -152,11 +147,9 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim): num_of_params = 6 value_info = [] for i in range(num_of_params): - value_info += [ - helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) - ] + value_info += [helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)] - modelproto = helper.make_model( + modelproto = qonnx_make_model( helper.make_graph( name="test", inputs=[top_in], @@ -179,9 +172,7 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim): np.random.seed(0) for i in range(num_of_params): - model.set_initializer( - "p" + str(i), np.random.rand(*input_shape).astype(np.float32) - ) + model.set_initializer("p" + str(i), np.random.rand(*input_shape).astype(np.float32)) # need equal mults: model.set_initializer("p2", model.get_initializer("p1")) diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py index aa77b5cf1a..77dbf3a971 100644 --- a/tests/transformation/streamline/test_maxpool_nhwc.py +++ 
b/tests/transformation/streamline/test_maxpool_nhwc.py @@ -7,28 +7,20 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MakeMaxPoolNHWC def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): - ofm_dim_h = compute_pool_output_dim( - ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode - ) - ofm_dim_w = compute_pool_output_dim( - ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode - ) - inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] - ) + ofm_dim_h = compute_pool_output_dim(ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) outp_mp = oh.make_tensor_value_info( "outp_mp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] - ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) maxpool_node = oh.make_node( "MaxPool", @@ -56,7 +48,7 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt) value_info=[outp_mp], ) - model = oh.make_model(graph, producer_name="maxpool_model") + model = qonnx_make_model(graph, producer_name="maxpool_model") model = ModelWrapper(model) model.set_tensor_datatype("inp", idt) model.set_tensor_datatype("outp", idt) @@ -83,9 +75,7 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt) @pytest.mark.parametrize("idt", [DataType["INT4"]]) def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): # create MaxPool node - maxpool_model = create_maxpool( - ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt - ) + maxpool_model = create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt) # generate input tensor for testing input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) @@ -100,9 +90,7 @@ def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, i # execute transformed model output_node_name = maxpool_model.graph.output[0].name - output_dict = oxe.execute_onnx( - maxpool_model, input_dict, return_full_exec_context=False - ) + output_dict = oxe.execute_onnx(maxpool_model, input_dict, return_full_exec_context=False) output = output_dict[output_node_name] # compare outputs diff --git a/tests/transformation/streamline/test_move_add_past_mul.py b/tests/transformation/streamline/test_move_add_past_mul.py index 0fb4dd9f7a..ea9c2a954d 100644 --- a/tests/transformation/streamline/test_move_add_past_mul.py +++ b/tests/transformation/streamline/test_move_add_past_mul.py @@ -33,6 +33,7 @@ from onnx import TensorProto from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as ox from finn.transformation.streamline import MoveAddPastMul @@ -44,7 +45,7 @@ def test_move_add_past_mul_single(): add_param = oh.make_tensor_value_info("add_param", 
TensorProto.FLOAT, [2]) mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2]) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], @@ -76,7 +77,7 @@ def test_move_add_past_mul_multi(): add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2]) mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2]) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], @@ -116,7 +117,7 @@ def test_move_add_past_mul_only_if_linear(): value_info += [oh.make_tensor_value_info("mul1_param", TensorProto.FLOAT, [1])] value_info += [oh.make_tensor_value_info("mul2_param", TensorProto.FLOAT, [1])] value_info += [oh.make_tensor_value_info("mul3_param", TensorProto.FLOAT, [1])] - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py index 7eb7f9f1af..8b2f10b658 100644 --- a/tests/transformation/streamline/test_move_chw_add_past_conv.py +++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py @@ -33,6 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveAddPastConv @@ -72,7 +73,7 @@ def test_move_chw_add_past_conv(idim, k, s, ich, och): add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"]) conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config) - model = helper.make_model( + model = qonnx_make_model( helper.make_graph( nodes=[add_node, conv_node], name="move-add-graph", @@ -84,13 +85,9 @@ def test_move_chw_add_past_conv(idim, k, s, ich, och): model = ModelWrapper(model) # initialize model - a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype(np.float32) model.set_initializer("a0", a0_values) - a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype( - np.float32 - ) + a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype(np.float32) model.set_initializer("a1", a1_values) model = model.transform(InferShapes()) diff --git a/tests/transformation/streamline/test_move_flatten_past_affine.py b/tests/transformation/streamline/test_move_flatten_past_affine.py index 8c3f71d1f3..22c5e19fac 100644 --- a/tests/transformation/streamline/test_move_flatten_past_affine.py +++ b/tests/transformation/streamline/test_move_flatten_past_affine.py @@ -36,7 +36,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveFlattenPastAffine @@ -74,7 +74,7 @@ def test_move_flatten_past_affine(data_layout, batch_size): 
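Algebraic background for MoveAddPastMul, which the tests above exercise at the graph level: since (x + a) * m == x * m + a * m, the transform can emit the Mul first and fold the old Mul parameter into a new Add parameter. A small numpy check with arbitrary values:

    import numpy as np

    x = np.asarray([-1.0, 1.0], dtype=np.float32)
    a = np.asarray([2.0, 3.0], dtype=np.float32)  # Add parameter
    m = np.asarray([4.0, 5.0], dtype=np.float32)  # Mul parameter
    # after the reorder, the Add parameter becomes a * m
    assert np.allclose((x + a) * m, x * m + a * m)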
value_info=[a0, a1, a2], ) - model = helper.make_model(graph, producer_name="move_reshape_model") + model = qonnx_make_model(graph, producer_name="move_reshape_model") model = ModelWrapper(model) # initialize values diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py index 83d7a28c05..82336cd3e6 100644 --- a/tests/transformation/streamline/test_move_flatten_past_topk.py +++ b/tests/transformation/streamline/test_move_flatten_past_topk.py @@ -36,7 +36,7 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveFlattenPastTopK @@ -47,7 +47,7 @@ @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW]) # batch size @pytest.mark.parametrize("batch_size", [1, 2]) -def test_move_flatten_past_affine(data_layout, batch_size): +def test_move_flatten_past_topk(data_layout, batch_size): if data_layout == DataLayout.NHWC: ishape = [batch_size, 1, 1, 1024] oshape = [batch_size, 1024] @@ -67,7 +67,7 @@ def test_move_flatten_past_affine(data_layout, batch_size): outputs=[outp], ) - model = helper.make_model(graph, producer_name="move_flatten_model") + model = qonnx_make_model(graph, producer_name="move_flatten_model") model = ModelWrapper(model) model.set_tensor_datatype("inp", DataType["INT2"]) diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py index 4986363ff4..dd83681fc2 100644 --- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py +++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py @@ -30,7 +30,7 @@ from onnx import TensorProto from onnx import helper as oh from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd @@ -56,18 +56,10 @@ def create_model(perm): "Add", inputs=["out_transpose1", "out_transpose2"], outputs=["out_join1"] ) - in_transpose1 = oh.make_tensor_value_info( - "in_transpose1", TensorProto.FLOAT, in_shape - ) - in_transpose2 = oh.make_tensor_value_info( - "in_transpose2", TensorProto.FLOAT, in_shape - ) - out_transpose1 = oh.make_tensor_value_info( - "out_transpose1", TensorProto.FLOAT, out_shape - ) - out_transpose2 = oh.make_tensor_value_info( - "out_transpose2", TensorProto.FLOAT, out_shape - ) + in_transpose1 = oh.make_tensor_value_info("in_transpose1", TensorProto.FLOAT, in_shape) + in_transpose2 = oh.make_tensor_value_info("in_transpose2", TensorProto.FLOAT, in_shape) + out_transpose1 = oh.make_tensor_value_info("out_transpose1", TensorProto.FLOAT, out_shape) + out_transpose2 = oh.make_tensor_value_info("out_transpose2", TensorProto.FLOAT, out_shape) out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_shape) graph = oh.make_graph( @@ -81,7 +73,7 @@ def create_model(perm): ], ) - onnx_model = oh.make_model(graph, producer_name="test_model") + onnx_model = qonnx_make_model(graph, producer_name="test_model") model = ModelWrapper(onnx_model) return model diff --git 
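Why MoveTransposePastJoinAdd (tested just above) is sound: transposing both join inputs and then adding gives the same tensor as adding first and transposing the sum once, so two Transpose nodes collapse into one after the join. A minimal sketch:

    import numpy as np

    x = np.random.rand(1, 2, 3, 4)
    y = np.random.rand(1, 2, 3, 4)
    perm = (0, 2, 3, 1)
    assert np.array_equal(x.transpose(perm) + y.transpose(perm), (x + y).transpose(perm))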
a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py index bf25eee9e6..2dee153545 100644 --- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py +++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py @@ -32,6 +32,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold @@ -66,14 +67,10 @@ def test_move_maxpool_past_multithreshold(): value_info = [] thres1_shape = [1, 1] - value_info += [ - helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape) - ] + value_info += [helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape)] thres2_shape = [ch, 14] - value_info += [ - helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape) - ] + value_info += [helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape)] nodes = [] nodes += [helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config)] @@ -99,7 +96,7 @@ def test_move_maxpool_past_multithreshold(): ) ] - modelproto = helper.make_model( + modelproto = qonnx_make_model( helper.make_graph( name="test", inputs=[top_in], @@ -113,9 +110,7 @@ def test_move_maxpool_past_multithreshold(): model = model.transform(InferDataTypes()) model.set_initializer("thres1", np.array([[0]], dtype=np.float32)) - model.set_initializer( - "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0) - ) + model.set_initializer("thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)) # Transform new_model = model.transform(MoveMaxPoolPastMultiThreshold()) diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py index 401631a728..303b97c69f 100644 --- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py +++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py @@ -33,7 +33,7 @@ from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveMulPastDWConv @@ -65,14 +65,10 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, total_pad) # set up onnx model - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1]) W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]) Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) @@ -94,7 +90,7 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): value_info=[mul, W], ) - model = 
helper.make_model(graph, producer_name="mulpastconv-model") + model = qonnx_make_model(graph, producer_name="mulpastconv-model") model = ModelWrapper(model) inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim]) mul_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, 1, 1]) diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py index fcc1b65132..61dddd56e9 100755 --- a/tests/transformation/streamline/test_move_mul_past_maxpool.py +++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py @@ -34,7 +34,7 @@ from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveMulPastMaxPool @@ -65,13 +65,9 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative): ofm_dim = compute_pool_output_dim(ifm_dim, k, stride, pad) # set up onnx model - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, mul_shape) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]) Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) @@ -92,7 +88,7 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative): value_info=[mul], ) - model = helper.make_model(graph, producer_name="mulpastmaxpool-model") + model = qonnx_make_model(graph, producer_name="mulpastmaxpool-model") model = ModelWrapper(model) inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim]) mul_values = np.random.random_sample(mul_shape).astype(np.float32) diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py index 5064fa3fca..e9433178c8 100644 --- a/tests/transformation/streamline/test_move_past_fork.py +++ b/tests/transformation/streamline/test_move_past_fork.py @@ -28,80 +28,109 @@ import pytest import numpy as np -from onnx import TensorProto, helper +import onnx.parser as oprs from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name import finn.core.onnx_exec as oxe -from finn.transformation.streamline.reorder import MoveLinearPastFork +from finn.transformation.streamline.reorder import ( + MoveLinearPastFork, + MoveTransposePastFork, +) + + +@pytest.mark.streamline +def test_move_past_fork_transpose(): + shp = [1, 3, 32, 32] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + {{ + t0_out = Transpose(in0) + t1_out = Transpose(t0_out) + t2_out = Transpose(t0_out) + out0 = Add(t1_out, t2_out) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + new_model = model.transform(MoveTransposePastFork()) + new_model = 
new_model.transform(GiveUniqueNodeNames())
+    nodes = new_model.graph.node
+    assert oxe.compare_execution(model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)})
+    assert len(nodes) == 5
+    assert not new_model.is_fork_node(get_by_name(nodes, "Transpose_0"))
 
 
 @pytest.mark.streamline
 @pytest.mark.parametrize("ch", [64, 1])
 # ifmdim
 @pytest.mark.parametrize("ifmdim", [-1, 7])
-def test_move_past_fork(ch, ifmdim):
-    # generate test vectors of correct shape
+def test_move_past_fork_linear(ch, ifmdim):
     if ifmdim == -1:
-        input_shape = (1, ch)
+        shp = [1, ch]
     else:
-        input_shape = (1, ch, ifmdim, ifmdim)
+        shp = [1, ch, ifmdim, ifmdim]
+    shp_str = str(shp)
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    <
+        float{shp_str} add0_param,
+        float{shp_str} mul_shared_param,
+        float{shp_str} add2_param,
+        float{shp_str} mul2_param,
+        float{shp_str} add3_param,
+        float{shp_str} add4_param,
+        float{shp_str} mul3_param,
+        float{shp_str} add6_param
+    >
+    {{
-    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-
-    num_of_params = 8
-    value_info = []
-    for i in range(num_of_params):
-        value_info += [
-            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
-        ]
-
-    add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"])
-    mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"])
-    add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"])
-    mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"])
-    modelproto = helper.make_model(
-        helper.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                # fork1
-                add_1_to_move,
-                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
-                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
-                helper.make_node("Add", ["t2", "t3"], ["t4"]),
-                helper.make_node("Add", ["t4", "p3"], ["t5"]),
-                # fork2
-                mul_1_to_move,
-                add_2_to_move,
-                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
-                helper.make_node("Add", ["t6", "t7"], ["t8"]),
-                # empty branches: do nothing
-                mul_1_not_to_move,
-                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
+        add0_out = Add(in0, add0_param)
+        mul0_out = Mul(add0_out, mul_shared_param)
+        mul1_out = Mul(add0_out, mul_shared_param)
+        add1_out = Add(mul0_out, mul1_out)
+        add2_out = Add(add1_out, add2_param)
+        mul2_out = Mul(add2_out, mul2_param)
+        add3_out = Add(mul2_out, add3_param)
+        add4_out = Add(mul2_out, add4_param)
+        add5_out = Add(add3_out, add4_out)
+        mul3_out = Mul(add5_out, mul3_param)
+        out0 = Add(mul3_out, add6_param)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
 
     np.random.seed(0)
-    for i in range(num_of_params):
-        model.set_initializer(
-            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
-        )
-
+    for tensor_name in model.get_all_tensor_names():
+        if tensor_name.endswith("_param"):
+            pshape = model.get_tensor_shape(tensor_name)
+            model.set_initializer(tensor_name, np.random.rand(*pshape).astype(np.float32))
+    model = model.transform(GiveUniqueNodeNames())
     # Transform
     new_model = model.transform(MoveLinearPastFork())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
-
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    inp_dict = {"in0": np.random.rand(*shp).astype(np.float32)}
     # Test
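A note on get_by_name, which the fork assertions here rely on: as far as I can tell from qonnx.util.basic, it linearly scans a protobuf container for an element whose name field matches and returns None when nothing matches, which is why GiveUniqueNodeNames runs first to obtain stable names such as "Transpose_0". Standalone sketch:

    import onnx.helper as oh
    from qonnx.util.basic import get_by_name

    n0 = oh.make_node("Transpose", ["x"], ["y"], name="Transpose_0", perm=[0, 2, 3, 1])
    n1 = oh.make_node("Add", ["y", "y"], ["z"], name="Add_0")
    # returns the element whose .name matches the query
    assert get_by_name([n0, n1], "Transpose_0") is n0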
assert oxe.compare_execution(model, new_model, inp_dict) - assert not new_model.is_fork_node(add_1_to_move) - assert not new_model.is_fork_node(mul_1_to_move) - assert not new_model.is_fork_node(add_2_to_move) - assert new_model.is_fork_node(mul_1_not_to_move) + nodes = new_model.graph.node + assert len(new_model.get_nodes_by_op_type("Add")) == 9 + assert len(new_model.get_nodes_by_op_type("Mul")) == 5 + assert not new_model.is_fork_node(get_by_name(nodes, "Add_0")) + assert new_model.is_join_node(get_by_name(nodes, "Add_2")) + assert not new_model.is_fork_node(get_by_name(nodes, "Mul_2")) + assert not new_model.is_join_node(get_by_name(nodes, "Add_5")) assert len(new_model.graph.node) == 14 diff --git a/tests/transformation/streamline/test_move_scalar_past_conv.py b/tests/transformation/streamline/test_move_scalar_past_conv.py index 59b8b8f8b2..bb99fd1d8f 100644 --- a/tests/transformation/streamline/test_move_scalar_past_conv.py +++ b/tests/transformation/streamline/test_move_scalar_past_conv.py @@ -32,6 +32,7 @@ from onnx import TensorProto from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as ox from finn.transformation.streamline import MoveAddPastConv, MoveScalarMulPastConv @@ -79,7 +80,7 @@ def test_move_scalar_past_conv(test_args, padding): value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)] value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)] - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], @@ -158,7 +159,7 @@ def test_move_scalar_past_conv_only_if_linear(test_args): value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, conv_param_shape)] value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, conv_param_shape)] - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py index 6fdaaadfae..e4f4357fff 100644 --- a/tests/transformation/streamline/test_move_scalar_past_matmul.py +++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py @@ -33,6 +33,7 @@ from onnx import TensorProto from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as ox from finn.transformation.streamline import ( @@ -47,7 +48,7 @@ def test_move_scalar_mul_past_matmul(): mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 1]) matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2]) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], @@ -62,9 +63,7 @@ def test_move_scalar_mul_past_matmul(): model = ModelWrapper(modelproto) model = model.transform(InferShapes()) model.set_initializer("mul_param", np.asarray([[3]], dtype=np.float32)) - model.set_initializer( - "matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32) - ) + model.set_initializer("matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32)) new_model = model.transform(MoveScalarMulPastMatMul()) inp_dict = {"top_in": np.asarray([[-1.0, 1.0]], dtype=np.float32)} assert 
ox.compare_execution(model, new_model, inp_dict) @@ -79,7 +78,7 @@ def test_move_scalar_add_past_matmul(): add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [1, 1]) matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2]) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], @@ -94,9 +93,7 @@ def test_move_scalar_add_past_matmul(): model = ModelWrapper(modelproto) model = model.transform(InferShapes()) model.set_initializer("add_param", np.asarray([[3]], dtype=np.float32)) - model.set_initializer( - "matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32) - ) + model.set_initializer("matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32)) new_model = model.transform(MoveScalarAddPastMatMul()) inp_dict = {"top_in": np.asarray([[-1.0, 1.0]], dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) @@ -122,7 +119,7 @@ def test_move_scalar_past_matmul_only_if_linear(test_args): p2 = oh.make_tensor_value_info("p2", TensorProto.FLOAT, matmul_shape) p3 = oh.make_tensor_value_info("p3", TensorProto.FLOAT, matmul_shape) p4 = oh.make_tensor_value_info("p4", TensorProto.FLOAT, matmul_shape) - modelproto = oh.make_model( + modelproto = qonnx_make_model( oh.make_graph( name="test", inputs=[top_in], diff --git a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py index 9662ba8a90..6bf72961ac 100644 --- a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py +++ b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py @@ -36,6 +36,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline.reorder import MoveTransposePastScalarMul @@ -71,7 +72,7 @@ def test_move_transpose_past_scalar_mul(perm, scalar, data_layout): value_info=[a0], ) - model = helper.make_model(graph, producer_name="mv_transpose_model") + model = qonnx_make_model(graph, producer_name="mv_transpose_model") model = ModelWrapper(model) # initialize values diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 1ec5f02e87..85c60b37d5 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -32,6 +32,7 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.util.basic import qonnx_make_model import finn.core.onnx_exec as oxe from finn.transformation.streamline import RoundAndClipThresholds @@ -46,7 +47,7 @@ def test_round_thresholds(): "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" ) graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out]) - model_def = helper.make_model(graph_def) + model_def = qonnx_make_model(graph_def) model = ModelWrapper(model_def) threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32) model.set_initializer("thresholds", threshold_val) diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py 
b/tests/transformation/streamline/test_scale_resize_nhwc.py new file mode 100644 index 0000000000..350f5b3133 --- /dev/null +++ b/tests/transformation/streamline/test_scale_resize_nhwc.py @@ -0,0 +1,271 @@ +import pytest + +import numpy as np +import onnx +import onnx.helper as oh +import qonnx.core.data_layout as DataLayout +from onnx import TensorProto +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +from finn.transformation.streamline.reorder import MakeScaleResizeNHWC + + +def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp_up = oh.make_tensor_value_info( + "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) + + resize_node = oh.make_node( + "Resize", + inputs=["inp", "roi", "scales"], + outputs=["outp_up"], + name="Resize1", + mode=mode, + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["outp_up"], + outputs=["outp"], + name="Transpose1", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[resize_node, transpose_node], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_up, param, roi], + ) + + model = qonnx_make_model(graph, producer_name="resize_model1") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + model.set_tensor_layout("inp", DataLayout.NCHW) + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]) + outp_tr = oh.make_tensor_value_info( + "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["inp"], + outputs=["outp_tr"], + name="Transpose1", + perm=[0, 3, 1, 2], + ) + + resize_node = oh.make_node( + "Resize", + inputs=["outp_tr", "roi", "scales"], + outputs=["outp"], + name="Resize1", + mode=mode, + ) + + graph = oh.make_graph( + nodes=[transpose_node, resize_node], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_tr, param, roi], + ) + + model = qonnx_make_model(graph, producer_name="resize_model2") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_layout("inp", DataLayout.NHWC) + + 
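The layout identity behind MakeScaleResizeNHWC for the "nearest" mode used in these builders: integer-factor nearest-neighbor upsampling behaves like np.repeat on the spatial axes, and repeating commutes with the NCHW-to-NHWC transpose. A minimal check with hypothetical shapes:

    import numpy as np

    x = np.random.rand(1, 3, 8, 8)  # NCHW
    sh, sw = 2, 3                   # scales[2], scales[3]
    up_then_tr = x.repeat(sh, axis=2).repeat(sw, axis=3).transpose(0, 2, 3, 1)
    tr_then_up = x.transpose(0, 2, 3, 1).repeat(sh, axis=1).repeat(sw, axis=2)
    assert np.array_equal(up_then_tr, tr_then_up)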
model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): + ofm_dim_h = ifm_dim[0] * scales[2] + ofm_dim_w = ifm_dim[1] * scales[3] + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) + + param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, scales) + + # Not actually used, only needed for compliance with the Resize node interface + roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) + + outp_tr = oh.make_tensor_value_info( + "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + + outp_up = oh.make_tensor_value_info( + "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) + + transpose_node1 = onnx.helper.make_node( + "Transpose", + inputs=["inp"], + outputs=["outp_tr"], + name="Transpose1", + perm=[0, 3, 1, 2], + ) + + resize_node = oh.make_node( + "Resize", + inputs=["outp_tr", "roi", "scales"], + outputs=["outp_up"], + name="Resize1", + mode=mode, + ) + + transpose_node2 = onnx.helper.make_node( + "Transpose", + inputs=["outp_up"], + outputs=["outp"], + name="Transpose2", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[transpose_node1, resize_node, transpose_node2], + name="resize_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_up, outp_tr, param, roi], + ) + + model = qonnx_make_model(graph, producer_name="resize_model3") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_layout("inp", DataLayout.NHWC) + + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + + return model + + +def check_transform(model): + graph = model.graph + node_ind = 0 + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC: + return True + return False + + +@pytest.mark.streamline +# input dimension +@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)]) +# input channels +@pytest.mark.parametrize("ifm_ch", [3]) +# scales +@pytest.mark.parametrize("scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)]) +# mode +@pytest.mark.parametrize("mode", ["nearest"]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): + # create models + resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) + resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt) + resize_model3 = create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) + + # set initializers + resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32)) + resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32)) + + # generate input tensor for testing + input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) + input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) + input_dict_nchw = {"inp": input_tensor_nchw} + input_dict_nhwc = {"inp": input_tensor_nhwc} + + # execute first model + output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw) + expected1 = output_dict1["outp"] + + # transform 
Resize into ResizeNHWC + resize_model1 = resize_model1.transform(MakeScaleResizeNHWC()) + resize_model1 = resize_model1.transform(InferDataLayouts()) + + # execute transformed model + output_node_name1 = resize_model1.graph.output[0].name + output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw, return_full_exec_context=False) + output1 = output_dict1[output_node_name1] + + # compare outputs + assert (expected1 == output1).all() + assert check_transform(resize_model1) + + # execute second model + output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc) + expected2 = output_dict2["outp"] + + # transform Resize into ResizeNHWC + resize_model2 = resize_model2.transform(MakeScaleResizeNHWC()) + resize_model2 = resize_model2.transform(InferDataLayouts()) + + # execute transformed model + output_node_name2 = resize_model2.graph.output[0].name + output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc, return_full_exec_context=False) + output2 = output_dict2[output_node_name2] + + # compare outputs + assert (expected2 == output2).all() + assert check_transform(resize_model2) + + # execute third model + output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc) + expected3 = output_dict3["outp"] + + # transform Resize into ResizeNHWC + resize_model3 = resize_model3.transform(MakeScaleResizeNHWC()) + resize_model3 = resize_model3.transform(InferDataLayouts()) + + # execute transformed model + output_node_name3 = resize_model3.graph.output[0].name + output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc, return_full_exec_context=False) + output3 = output_dict3[output_node_name3] + + # compare outputs + assert (expected3 == output3).all() + assert check_transform(resize_model3) diff --git a/tests/transformation/streamline/test_sign_to_thres.py b/tests/transformation/streamline/test_sign_to_thres.py index 839680bd7a..1386592563 100644 --- a/tests/transformation/streamline/test_sign_to_thres.py +++ b/tests/transformation/streamline/test_sign_to_thres.py @@ -28,16 +28,19 @@ import pytest -import brevitas.onnx as bo import onnx import onnx.numpy_helper as nph import os +import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import ConvertSignToThres from finn.util.test import get_test_model_trained @@ -47,8 +50,10 @@ @pytest.mark.streamline def test_sign_to_thres(): lfc = get_test_model_trained("LFC", 1, 1) - bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) new_model = model.transform(ConvertSignToThres()) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 6a82925012..8a91a49278 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -26,23 +26,26 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 
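On test_sign_to_thres above: ConvertSignToThres can express a Sign activation as a single threshold at 0 with bipolar outputs; the only subtlety is x == 0, where ONNX Sign returns 0 while a threshold comparison must pick one of {-1, +1}. Sketch of the equivalence away from zero:

    import numpy as np

    x = np.asarray([-3.0, -1.0, 0.0, 2.0], dtype=np.float32)
    bipolar = np.where(x >= 0, 1.0, -1.0)  # one threshold at 0, bipolar output
    nz = x != 0
    assert np.array_equal(bipolar[nz], np.sign(x)[nz])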
THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, + GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, ) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -63,16 +66,20 @@ def test_streamline_cnv(size, wbits, abits): nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) - bo.export_finn_onnx(fc, (1, 3, 32, 32), finn_onnx) + export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(RemoveStaticGraphInputs()) # load one of the test vectors - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # run using FINN-based execution diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index 9000821435..edc4a96fe2 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -28,22 +28,26 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph +import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, + GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, ) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -66,11 +70,14 @@ def test_streamline_fc(size, wbits, abits): nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) - bo.export_finn_onnx(fc, (1, 1, 28, 28), finn_onnx) + export_qonnx(fc, torch.randn(1, 1, 28, 
28), finn_onnx)
+    qonnx_cleanup(finn_onnx, out_file=finn_onnx)
     model = ModelWrapper(finn_onnx)
+    model = model.transform(ConvertQONNXtoFINN())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(RemoveStaticGraphInputs())
     # load one of the test vectors
diff --git a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
index fd4e37807c..fd5033674b 100644
--- a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
+++ b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
@@ -26,22 +26,24 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import pkg_resources as pk
-
 import pytest

-import brevitas.onnx as bo
+import importlib_resources as importlib
 import numpy as np
 import onnx
 import onnx.numpy_helper as nph
 import os
+import torch
+from brevitas.export import export_qonnx
 from pkgutil import get_data
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine
 from qonnx.transformation.fold_constants import FoldConstants
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.test import get_test_model_trained

 export_onnx_path = "test_output_bn2affine.onnx"
@@ -50,12 +52,15 @@
 @pytest.mark.transform
 def test_batchnorm_to_affine_cnv_w1a1():
     lfc = get_test_model_trained("CNV", 1, 1)
-    bo.export_finn_onnx(lfc, (1, 3, 32, 32), export_onnx_path)
+    export_qonnx(lfc, torch.randn(1, 3, 32, 32), export_onnx_path)
+    qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
     model = ModelWrapper(export_onnx_path)
+    model = model.transform(ConvertQONNXtoFINN())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
-    fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz")
-    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz"
+    with importlib.as_file(ref) as fn:
+        input_tensor = np.load(fn)["arr_0"].astype(np.float32)
     input_tensor = input_tensor / 255
     assert input_tensor.shape == (1, 3, 32, 32)
     input_dict = {"0": input_tensor}
@@ -75,8 +80,10 @@ def test_batchnorm_to_affine_cnv_w1a1():
 @pytest.mark.transform
 def test_batchnorm_to_affine_lfc_w1a1():
     lfc = get_test_model_trained("LFC", 1, 1)
-    bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path)
+    export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path)
+    qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
     model = ModelWrapper(export_onnx_path)
+    model = model.transform(ConvertQONNXtoFINN())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     new_model = model.transform(BatchNormToAffine())
diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py
index 952ce306a4..25bf890271 100644
--- a/tests/transformation/test_infer_data_layouts_cnv.py
+++ b/tests/transformation/test_infer_data_layouts_cnv.py
@@ -28,19 +28,26 @@

 import pytest

-import brevitas.onnx as bo
 import os
 import qonnx.core.data_layout as DataLayout
+import torch
+from brevitas.export import export_qonnx
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from qonnx.transformation.fold_constants import FoldConstants
-from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+)
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.util.test import get_test_model_trained
@@ -51,11 +58,14 @@
 @pytest.mark.transform
 def test_infer_data_layouts_cnv():
     cnv = get_test_model_trained("CNV", 1, 1)
-    bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv)
+    export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv)
+    qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv)
     model = ModelWrapper(export_onnx_path_cnv)
+    model = model.transform(ConvertQONNXtoFINN())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(Streamline())
     model = model.transform(InferDataLayouts())
@@ -103,9 +113,7 @@ def test_infer_data_layouts_cnv():
     # note: im2col output isn't really NHWC or any other common layout
     # since the concept of channels changes with lowering... but it is
     # conceptually close to NHWC since the innermost dim gets multiplied
-    assert (
-        model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC
-    )
+    assert model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC
     assert model.get_tensor_layout("MatrixVectorActivation_3_out0") == DataLayout.NHWC
     assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC
     assert model.get_tensor_layout("MatrixVectorActivation_6_out0") == DataLayout.NC
diff --git a/tests/transformation/test_infer_datatypes_lfc.py b/tests/transformation/test_infer_datatypes_lfc.py
index 9798005349..b9d9dc558f 100644
--- a/tests/transformation/test_infer_datatypes_lfc.py
+++ b/tests/transformation/test_infer_datatypes_lfc.py
@@ -28,15 +28,18 @@

 import pytest

-import brevitas.onnx as bo
 import os
+import torch
+from brevitas.export import export_qonnx
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.fold_constants import FoldConstants
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.cleanup import cleanup as qonnx_cleanup

+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.test import get_test_model_trained

 export_onnx_path = "test_infer_datatypes.onnx"
@@ -45,8 +48,10 @@
 @pytest.mark.transform
 def test_infer_datatypes_lfc():
     lfc = get_test_model_trained("LFC", 1, 1)
-    bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path)
+    export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path)
+    qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
     model = ModelWrapper(export_onnx_path)
+    model = model.transform(ConvertQONNXtoFINN())
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
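The test diffs above all make the same migration: the deprecated `bo.export_finn_onnx` call is replaced by a QONNX export, followed by a graph cleanup and an explicit conversion into FINN's internal dialect. A minimal sketch of that shared flow, assuming a Brevitas test model; `export_path` is an illustrative file name, not taken from the diff, while all imports and calls mirror the diff:

```python
# Sketch of the QONNX-based export flow that replaces bo.export_finn_onnx.
import torch
from brevitas.export import export_qonnx
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.util.cleanup import cleanup as qonnx_cleanup

from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
from finn.util.test import get_test_model_trained

export_path = "model_qonnx.onnx"  # hypothetical output file
lfc = get_test_model_trained("LFC", 1, 1)
# the QONNX exporter takes an example input tensor rather than a bare shape
export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_path)
# normalize the exported graph in place before loading it
qonnx_cleanup(export_path, out_file=export_path)
model = ModelWrapper(export_path)
# lower QONNX Quant nodes into FINN's internal representation
model = model.transform(ConvertQONNXtoFINN())
```

After `ConvertQONNXtoFINN`, the remaining transforms in each test (shape inference, constant folding, renaming) proceed as before.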
diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py
index 43055f6704..939082b87b 100644
--- a/tests/transformation/test_qonnx_to_finn.py
+++ b/tests/transformation/test_qonnx_to_finn.py
@@ -27,21 +27,16 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import pkg_resources as pk
-
 import pytest

-import brevitas.export.onnx.generic as b_onnx
-import brevitas.onnx as bo
+import importlib_resources as importlib
 import numpy as np
 import onnx
 import onnx.numpy_helper as nph
 import torch
+from brevitas.export import export_qonnx
 from pkgutil import get_data
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.transformation.fold_constants import FoldConstants
-from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs
-from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup
 from tempfile import TemporaryDirectory
@@ -59,10 +54,9 @@ def get_brev_model_and_sample_inputs(model_name, wbits, abits):
         brev_model = get_test_model_trained(model_name, wbits, abits)
     elif model_name == "CNV":
         in_shape = (1, 3, 32, 32)
-        fn = pk.resource_filename(
-            "finn.qnn-data", "cifar10/cifar10-test-data-class3.npz"
-        )
-        input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+        ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz"
+        with importlib.as_file(ref) as fn:
+            input_tensor = np.load(fn)["arr_0"].astype(np.float32)
         input_tensor = input_tensor / 255
         brev_model = get_test_model_trained(model_name, wbits, abits)
     elif model_name == "mobilenet":
@@ -94,6 +88,8 @@ def analysis_testing_for_no_quant_nodes(model):
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("model_name", ["TFC", "SFC", "LFC", "CNV", "mobilenet"])
 def test_QONNX_to_FINN(model_name, wbits, abits):
+    if model_name == "mobilenet":
+        pytest.xfail("MobileNet test is temporarily excluded from QONNX testing.")
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     if model_name == "LFC" and wbits == 2 and abits == 2:
@@ -102,42 +98,17 @@ def test_QONNX_to_FINN(model_name, wbits, abits):
         pytest.skip("Mobilenet only runs at W2A2, though it's technically W4A4.")

     # Get test config and model
-    ATOL = 1e-7
-    brev_model, in_shape, input_tensor = get_brev_model_and_sample_inputs(
-        model_name, wbits, abits
-    )
+    ATOL = 1e-6
+    brev_model, in_shape, input_tensor = get_brev_model_and_sample_inputs(model_name, wbits, abits)
     temp_dir = TemporaryDirectory()
     qonnx_base_path = temp_dir.name + "/qonnx_{}.onnx"
-    finn_base_path = temp_dir.name + "/finn_{}.onnx"

     # Get Brevitas output
     torch_input_tensor = torch.from_numpy(input_tensor).float()
     brev_output = brev_model.forward(torch_input_tensor).detach().numpy()

-    # Get "clean" FINN model and it's output
-    _ = bo.export_finn_onnx(brev_model, in_shape, finn_base_path.format("raw"))
-    model = ModelWrapper(finn_base_path.format("raw"))
-    model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(InferShapes())
-    model = model.transform(FoldConstants())
-    model = model.transform(RemoveStaticGraphInputs())
-    model.save(finn_base_path.format("clean"))
-
-    model = ModelWrapper(finn_base_path.format("clean"))
-    input_dict = {model.graph.input[0].name: input_tensor}
-    output_dict = oxe.execute_onnx(model, input_dict, False)
-    finn_export_output = output_dict[model.graph.output[0].name]
-    # This test always fails on MobileNet for some reason
-    if model_name != "mobilenet":
-        assert np.isclose(
-            brev_output, finn_export_output, atol=ATOL
-        ).all(), "The output of the Brevitas model and the FINN model should match."
-
-    # Get the equivalent QONNX model
-    b_onnx.function.DOMAIN_STRING = "qonnx.custom_op.general"
-    _ = b_onnx.manager.BrevitasONNXManager.export(
-        brev_model, in_shape, qonnx_base_path.format("raw")
-    )
+    # Get QONNX model
+    _ = export_qonnx(brev_model, torch.randn(in_shape), qonnx_base_path.format("raw"))
     cleanup(qonnx_base_path.format("raw"), out_file=qonnx_base_path.format("clean"))

     # Compare output
@@ -148,11 +119,6 @@ def test_QONNX_to_FINN(model_name, wbits, abits):
     assert np.isclose(
         brev_output, qonnx_export_output, atol=ATOL
     ).all(), "The output of the Brevitas model and the QONNX model should match."
-    # This test always fails on MobileNet for some reason
-    if model_name != "mobilenet":
-        assert np.isclose(
-            qonnx_export_output, finn_export_output, atol=ATOL
-        ).all(), "The output of the FINN model and the QONNX model should match."

     # Run QONNX to FINN conversion
     model = ModelWrapper(qonnx_base_path.format("clean"))
@@ -164,9 +130,8 @@ def test_QONNX_to_FINN(model_name, wbits, abits):
     input_dict = {model.graph.input[0].name: input_tensor}
     output_dict = oxe.execute_onnx(model, input_dict, False)
     test_output = output_dict[model.graph.output[0].name]
-    assert np.isclose(test_output, finn_export_output, atol=ATOL).all(), (
-        "The output of the FINN model "
-        "and the QONNX -> FINN converted model should match."
+    assert np.isclose(test_output, qonnx_export_output, atol=ATOL).all(), (
+        "The output of the FINN model " "and the QONNX -> FINN converted model should match."
     )

     # Run analysis passes on the converted model
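Both this test and the batchnorm test above swap `pkg_resources.resource_filename` for `importlib_resources`. The key detail is that `as_file` only guarantees a real filesystem path for the duration of the `with` block, which is why the `np.load` call moves inside it. A short sketch of the pattern, using the same package data path as the diff:

```python
# importlib_resources pattern replacing pkg_resources.resource_filename.
import importlib_resources as importlib
import numpy as np

ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz"
# as_file() materializes the resource to a temporary path if needed
# (e.g. when the package data is zipped); the path is only valid
# inside the with block, so the file must be read there
with importlib.as_file(ref) as fn:
    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
input_tensor = input_tensor / 255  # scale uint8 pixel values into [0, 1]
```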
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index cdf69aebdd..3649d6709e 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -26,10 +26,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import pkg_resources as pk
-
 import pytest

+import numpy as np
 import os
 from shutil import copytree

@@ -43,7 +42,7 @@ def test_end2end_build_dataflow_directory():
     test_dir = make_build_dir("test_build_dataflow_directory_")
     target_dir = test_dir + "/build_dataflow"
-    example_data_dir = pk.resource_filename("finn.qnn-data", "build_dataflow/")
+    example_data_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/build_dataflow"
     copytree(example_data_dir, target_dir)
     build_dataflow_directory(target_dir)
     # check the generated files
@@ -55,11 +54,8 @@
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
-    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
     assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
-    assert os.path.isfile(
-        output_dir + "/report/estimate_layer_config_alternatives.json"
-    )
+    assert os.path.isfile(output_dir + "/report/estimate_layer_config_alternatives.json")
     assert os.path.isfile(output_dir + "/report/estimate_network_performance.json")
     assert os.path.isfile(output_dir + "/report/ooc_synth_and_timing.json")
     assert os.path.isfile(output_dir + "/report/rtlsim_performance.json")
@@ -68,8 +64,11 @@
     assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
     assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
     # verification outputs
-    verify_out_dir = output_dir + "/verification_output"
-    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")
+    verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
+    for i in range(verif_batchsize):
+        verify_out_dir = output_dir + "/verification_output"
+        assert os.path.isfile(verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy")
+        assert os.path.isfile(verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy")
+        assert os.path.isfile(verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy")
+        assert os.path.isfile(verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy")
+        assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")
diff --git a/tests/util/test_create.py b/tests/util/test_create.py
index dc44e4bd45..b8b439cf18 100644
--- a/tests/util/test_create.py
+++ b/tests/util/test_create.py
@@ -34,9 +34,7 @@

 @pytest.mark.util
-@pytest.mark.parametrize(
-    "bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]]
-)
+@pytest.mark.parametrize("bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]])
 def test_hls_random_mlp_maker(bitwidth):
     w = bitwidth
     a = bitwidth
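The `test_build_dataflow` update reflects a change in how verification outputs are checked: instead of one `_SUCCESS.npy` file per verification stage, the test now expects one file per sample in the batched `input.npy`, with the sample index in the file name. A sketch of the per-sample check, where `target_dir` and `output_dir` are hypothetical paths and the file-name patterns come from the diff:

```python
# Per-sample verification check, as in the updated test.
import os

import numpy as np

target_dir = "/tmp/build_dataflow"  # hypothetical build config dir
output_dir = target_dir + "/output"  # hypothetical build output dir
verify_out_dir = output_dir + "/verification_output"

# one set of verification artifacts per sample in the batched input
verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
for i in range(verif_batchsize):
    assert os.path.isfile(verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy")
    assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")
```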
diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing_hls.py
index 859b926543..b95bcd5d42 100644
--- a/tests/util/test_data_packing_hls.py
+++ b/tests/util/test_data_packing_hls.py
@@ -105,16 +105,12 @@ def test_npy2apintstream(test_shape, dtype):
     )
     with open(test_dir + "/compile.sh", "w") as f:
         f.write(cmd_compile)
-    compile = subprocess.Popen(
-        ["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir
-    )
+    compile = subprocess.Popen(["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir)
     (stdout, stderr) = compile.communicate()
     # make copy before saving the array
     ndarray = ndarray.copy()
     np.save(npy_in, ndarray)
-    execute = subprocess.Popen(
-        "./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir
-    )
+    execute = subprocess.Popen("./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir)
     (stdout, stderr) = execute.communicate()
     produced = np.load(npy_out)
     success = (produced == ndarray).all()
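The `test_data_packing_hls` changes are formatting-only, but the compile-then-execute structure they reflow is worth spelling out: each `Popen` is immediately followed by `communicate()`, so both steps run synchronously. A condensed sketch, with an illustrative scratch directory; the script and binary names follow the diff:

```python
# Synchronous compile-then-run pattern from test_npy2apintstream.
import subprocess

test_dir = "/tmp/test_npy2apintstream"  # hypothetical scratch dir
compile_proc = subprocess.Popen(["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir)
(stdout, stderr) = compile_proc.communicate()  # block until compilation finishes
run_proc = subprocess.Popen("./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir)
(stdout, stderr) = run_proc.communicate()  # block until the test binary exits
```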
diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md
index 63ca6ac832..2aaad0423b 100644
--- a/tutorials/fpga_flow/README.md
+++ b/tutorials/fpga_flow/README.md
@@ -4,7 +4,7 @@ This example demonstrates how to bring a FINN compiled model into the Vivado FPG

 If you are new to the command-line flow, more information can be found [here](https://finn.readthedocs.io/en/latest/command_line.html).

-This demo was created using Vivado 2020.1.
+This demo was created using Vivado 2022.1.

 ## Compiling the Model in FINN

@@ -26,7 +26,7 @@
 Prior to running, insure the following prerequisites have been met:
 - Install FINN and prerequisites. The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this.
 - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install. For example:
 > export FINN_XILINX_PATH=/opt/Xilinx
-> export FINN_XILINX_VERSION=2020.1
+> export FINN_XILINX_VERSION=2022.1
 - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo):
 > export FINN_ROOT=/home/foo/finn

@@ -112,7 +112,7 @@ testbench generators.

 There are any number of ways to bring the stitched IP into larger design.

-FINN already packages the stitched IP block design as a standalone IP-XACT component, which you can find under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip/ip`. You can add this to the list of IP repos and use it in your own Vivado designs. A good reference for this is [UG1119](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2020_1/ug1119-vivado-creating-packaging-ip-tutorial.pdf)
+FINN already packages the stitched IP block design as a standalone IP-XACT component, which you can find under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip/ip`. You can add this to the list of IP repos and use it in your own Vivado designs. A good reference for this is [UG1119](https://www.xilinx.com/content/dam/xilinx/support/documents/sw_manuals/xilinx2022_1/ug1119-vivado-creating-packaging-ip-tutorial.pdf)

 Keep in mind that all of the User IP Repo's included in the Stitched IP project (from `$FINN_HOST_BUILD_DIR` which is normally located under `/tmp/finn_dev_`) need to also be brought in as IP Repo's to any project using the stitched IP. It would be prudent to copy those IP repos to an appropriate archive location. You should also set the `FINN_ROOT` environment variable to point to the compiler installation directory, as some of the build scripts will
diff --git a/tutorials/fpga_flow/gen_tb_data.py b/tutorials/fpga_flow/gen_tb_data.py
index a525d92bfc..e73fd65094 100755
--- a/tutorials/fpga_flow/gen_tb_data.py
+++ b/tutorials/fpga_flow/gen_tb_data.py
@@ -48,9 +48,7 @@
             tb_data.write("{:02X}".format(test_x[i][j][k]))
         tb_data.write("\n")
         tb_data.write(
-            "ffffffffffffffffffffffffffffffffffffffffffffffffffffff{:02X}\n".format(
-                test_y[i]
-            )
+            "ffffffffffffffffffffffffffffffffffffffffffffffffffffff{:02X}\n".format(test_y[i])
         )

 print("Testbench data generated at " + file_name)
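For reference, the reflowed line in `gen_tb_data.py` builds the expected-output marker row of the testbench data file: a fixed run of `f` filler nibbles with the expected class label appended as two uppercase hex digits. A small sketch of the format, where the label value is illustrative and the filler literal is copied from the diff:

```python
# Marker-row format from gen_tb_data.py: 'f' filler nibbles, then the
# expected label as two uppercase hex digits. test_y_i is a hypothetical
# class label for one test image.
test_y_i = 3
row = "ffffffffffffffffffffffffffffffffffffffffffffffffffffff{:02X}\n".format(test_y_i)
# the row ends in "...ff03" for label 3, which the testbench compares
# against the accelerator's prediction
```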