From c7ac85be0dc501f63e6b1329a84a62879af1dfb1 Mon Sep 17 00:00:00 2001
From: Giuseppe Di Guglielmo
Date: Sat, 7 Dec 2024 10:48:19 -0800
Subject: [PATCH] Func correct implementation of pooling layers

To avoid overflow on accumulators, use output types instead of input types.
---
 .../catapult/nnet_utils/nnet_pooling.h | 167 +++++++-----
 1 file changed, 64 insertions(+), 103 deletions(-)

diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h
index d6ab38a960..476fa99e16 100644
--- a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h
+++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h
@@ -1,13 +1,14 @@
 #ifndef NNET_POOLING_H_
 #define NNET_POOLING_H_
 
+#include "nnet_common.h"
 #include "nnet_helpers.h"
 #include <iostream>
 
 namespace nnet {
 
 // Return the maximum value from an array
-template <typename T, int N> T max(T x[N]) {
+template <typename T, int N, typename accum_t> accum_t max(T x[N]) {
     T y = x[0];
     for (int i = 1; i < N; i++) {
         y = x[i] > y ? x[i] : y;
@@ -15,52 +16,32 @@ template <typename T, int N> T max(T x[N]) {
     return y;
 }
 
-template <int W, int N> ac_int<W, true> avg(ac_int<W, true> (&x)[N]) {
-    // Use a wider accumulator than the input to avoid overflow
-    ac_int<W + ceillog2(N), true> tmp = 0;
-    for (int i = 0; i < N; i++) {
-        tmp += x[i];
-    }
-    tmp /= N;
-    // Now cast back to original type
-    ac_int<W, true> y = tmp;
-    return tmp;
-}
-
-template <int W, int I, int N> ac_fixed<W, I, true> avg(ac_fixed<W, I, true> (&x)[N]) {
-    // Use a wider accumulator than the input to avoid overflow
-    ac_fixed<W + ceillog2(N), I + ceillog2(N), true> tmp = 0;
-    for (int i = 0; i < N; i++) {
-        tmp += x[i];
-    }
-    tmp /= N;
-    // Now cast back to original type
-    ac_fixed<W, I, true> y = tmp;
-    return y;
-}
-
 // Return the mean value of an array
-template <typename T, int N> T avg(T (&x)[N]) {
-    T y = 0;
+template <typename T, int N, typename accum_t> accum_t avg(T (&x)[N], unsigned length) {
+    accum_t y = 0;
     for (int i = 0; i < N; i++) {
         y += x[i];
     }
-    y /= N;
+    y /= length;
     return y;
 }
 
 // Enumeration for pooling operation (max, avg, l2norm pooling)
 enum Pool_Op { Max, Average }; // L2Norm };
 
-template <typename T, int N, Pool_Op op> T pool_op(T (&x)[N]) {
+template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N], unsigned length) {
     switch (op) {
     case Max:
-        return max<T, N>(x);
+        return max<T, N, accum_t>(x);
     case Average:
-        return avg(x);
+        return avg<T, N, accum_t>(x, length);
         // case L2Norm: return l2norm<T, N>(x);
     }
 }
 
+template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N]) {
+    return pool_op<T, N, op, accum_t>(x, N);
+}
+
 template <typename T, Pool_Op op> T pad_val() {
     /*---
     *- In Tensorflow, pooling ignores the value in the padded cells
@@ -100,13 +81,14 @@ template <typename CONFIG_T> constexpr int pool_op_limit_1d() {
 
 template <class data_T, class res_T, typename CONFIG_T>
 void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) {
-    constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
-    (void)ce_reuse_factor;
     //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
 
     // TODO partition the arrays according to the reuse factor
     const int limit = pool_op_limit_1d<CONFIG_T>();
-    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    //#pragma HLS ALLOCATION function instances=pool_op limit=limit
+
+    // Add any necessary padding
+
     // Add padding and reduce input width to area covered by pooling function
     static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
     static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width;
@@ -114,41 +96,28 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF
     for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         // Loop over input image x in steps of stride
         for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) {
+            unsigned overlap_pixel = 0;
             data_T pool[CONFIG_T::pool_width];
-            #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
-            // Keep track of number of pixels in image vs padding region
-            unsigned img_overlap = 0;
-            // Loop over pool window x
-            for (int jj = 0; jj < CONFIG_T::stride_width; jj++) {
-                if (ii + jj < CONFIG_T::pad_left || ii + jj >= (full_padded_width - CONFIG_T::pad_right)) {
-                    // Add padding
-                    pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
-                    if (CONFIG_T::count_pad) {
-                        img_overlap++;
-                    }
-                } else {
+            //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+
+            for (int jj = 0; jj < CONFIG_T::pool_width; jj++) {
+                if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) {
                     pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
-                    img_overlap++;
-                }
+                    overlap_pixel++;
+                } else
+                    pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
             }
-            // do the pooling
-            // TODO in the case of average pooling, need to reduce width to area of pool window
-            // not overlapping padding region
+
+            int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width : overlap_pixel;
+
             res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
-                pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
-            // If the pool op is Average, the zero-padding needs to be removed from the results
-            if (CONFIG_T::pool_op == Average) {
-                data_T rescale = static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
-                res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
-            }
+                pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool, patch_size);
         }
     }
 }
 
 template <class data_T, class res_T, typename CONFIG_T>
 void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) {
-    constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
-    (void)ce_reuse_factor;
     //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
 
     assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
@@ -156,16 +125,17 @@ void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T r
 
     // TODO partition the arrays according to the reuse factor
     const int limit = pool_op_limit_1d<CONFIG_T>();
-    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    //#pragma HLS ALLOCATION function instances=pool_op limit=limit
+
     for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         data_T pool[CONFIG_T::n_in];
-        #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+        //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0
         for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
             pool[jj] = data[jj * CONFIG_T::n_filt + ff];
        }
         // do the pooling
-        res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op>(pool);
+        res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool);
     }
 }
 
@@ -196,19 +166,18 @@ struct pooling2d_config {
 };
 
 template <typename CONFIG_T> constexpr int pool_op_limit() {
-    return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor;
+    return DIV_ROUNDUP((CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt, CONFIG_T::reuse_factor);
 }
 
 template <class data_T, class res_T, typename CONFIG_T>
 void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt],
                   res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) {
-    constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
-    (void)ce_reuse_factor;
     //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
 
     // TODO partition the arrays according to the reuse factor
     const int limit = pool_op_limit<CONFIG_T>();
-    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    //#pragma HLS ALLOCATION function instances=pool_op limit=limit
+
     // Add padding and reduce input width to area covered by pooling function
     static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
     static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
@@ -221,41 +190,34 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
         // Loop over input image x in steps of stride
         for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) {
             data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width];
-            #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
-            // Keep track of number of pixels in image vs padding region
-            unsigned img_overlap = 0;
+            //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+
+            unsigned overlap_pixel = 0;
+
             // Loop over pool window y
             for (int kk = 0; kk < CONFIG_T::stride_height; kk++) {
                 // Loop over pool window x
                 for (int ll = 0; ll < CONFIG_T::stride_width; ll++) {
-                    if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) ||
-                        jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) {
-                        // Add padding
+                    bool cond1 = ii + kk >= CONFIG_T::pad_top && ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top;
+                    bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left;
+                    if (cond1 && cond2) {
+                        unsigned data_idx =
+                            ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) *
+                                CONFIG_T::n_filt +
+                            ff;
+                        pool[kk * CONFIG_T::stride_width + ll] = data[data_idx];
+                        overlap_pixel++;
+                    } else
                         pool[kk * CONFIG_T::stride_width + ll] = pad_val<data_T, CONFIG_T::pool_op>();
-                        if (CONFIG_T::count_pad) {
-                            img_overlap++;
-                        }
-                    } else {
-                        pool[kk * CONFIG_T::stride_width + ll] =
-                            data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt +
-                                 (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
-                        img_overlap++;
-                    }
                 }
             }
-            // do the pooling
-            // TODO in the case of average pooling, need to reduce height * width to area of pool window
-            // not overlapping padding region
+
+            int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel;
+
             res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
                 (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
-                pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
-            // If the pool op is Average, the zero-padding needs to be removed from the results
-            if (CONFIG_T::pool_op == Average) {
-                data_T rescale =
-                    static_cast<data_T>(CONFIG_T::pool_height) * static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
-                res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
-                    (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
-            }
+                pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool, patch_size);
         }
     }
 }
@@ -264,13 +226,12 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
 template <class data_T, class res_T, typename CONFIG_T>
 void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt],
                   res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) {
-    constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
-    (void)ce_reuse_factor;
     //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
 
     // TODO partition the arrays according to the reuse factor
     const int limit = pool_op_limit<CONFIG_T>();
-    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    //#pragma HLS ALLOCATION function instances=pool_op limit=limit
+
     // Add padding and reduce input width to area covered by pooling function
     static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
     static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
@@ -283,7 +244,7 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
         // Loop over input image x in steps of stride
         for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) {
             data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width];
-            #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+            //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0
             // Keep track of number of pixels in image vs padding region
             unsigned img_overlap = 0;
             // Loop over pool window y
@@ -294,9 +255,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
                         jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) {
                         // Add padding
                         pool[kk * CONFIG_T::stride_width + ll] = pad_val<data_T, CONFIG_T::pool_op>();
-                        if (CONFIG_T::count_pad) {
+                        if (CONFIG_T::count_pad)
                             img_overlap++;
-                        }
                     } else {
                         pool[kk * CONFIG_T::stride_width + ll] = data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width +
@@ -310,7 +270,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
             // not overlapping padding region
             res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) +
                 ff * CONFIG_T::out_height * CONFIG_T::out_width] =
-                pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
+                pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool);
             // If the pool op is Average, the zero-padding needs to be removed from the results
             if (CONFIG_T::pool_op == Average) {
                 data_T rescale =
@@ -331,12 +292,11 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
     assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
     assert(CONFIG_T::pool_height == CONFIG_T::stride_height);
 
-    constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
-    (void)ce_reuse_factor;
     //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
 
     const int limit = pool_op_limit<CONFIG_T>();
-    #pragma HLS ALLOCATION instances=pool_op limit=limit function
+    //#pragma HLS ALLOCATION function instances=pool_op limit=limit
 
 FiltLoop:
     for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
@@ -347,7 +307,8 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
             pool[i] = data[i * CONFIG_T::n_filt + filt];
         }
 
-        res[filt] = static_cast<res_T>(pool_op<data_T, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op>(pool));
+        res[filt] = static_cast<res_T>(
+            pool_op<data_T, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool));
     }
 }
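Note (illustration only, not part of the patch): the overflow described in the commit message can be reproduced with plain integer types. The sketch below uses int8_t inputs and an int32_t accumulator to stand in for the narrower data_T and the wider CONFIG_T::accum_t used above; the function names avg_narrow and avg_wide are hypothetical and exist only for this example.

    #include <cstdint>
    #include <cstdio>

    // Accumulating in the input type wraps around once the running sum leaves its range.
    static int8_t avg_narrow(const int8_t x[], unsigned n) {
        int8_t acc = 0; // same width as the inputs
        for (unsigned i = 0; i < n; i++)
            acc = static_cast<int8_t>(acc + x[i]);
        return static_cast<int8_t>(acc / static_cast<int>(n));
    }

    // Accumulating in a wider type, as the patch does via CONFIG_T::accum_t, keeps the sum exact.
    static int8_t avg_wide(const int8_t x[], unsigned n) {
        int32_t acc = 0;
        for (unsigned i = 0; i < n; i++)
            acc += x[i];
        return static_cast<int8_t>(acc / static_cast<int32_t>(n));
    }

    int main() {
        const int8_t window[4] = {100, 100, 100, 100}; // true average is 100, but the sum (400) exceeds int8_t
        std::printf("narrow: %d  wide: %d\n", avg_narrow(window, 4), avg_wide(window, 4));
        return 0;
    }

With the narrow accumulator the sum wraps and the reported average is wrong (-28 on a two's-complement target); with the wide accumulator it is 100, which is the behaviour the patch restores for average pooling.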