From f98d9d9ef60082f0ecb756e57279de058b9433f4 Mon Sep 17 00:00:00 2001 From: Anirudh0707 Date: Sun, 29 Nov 2020 17:56:16 +0530 Subject: [PATCH] Change short sequence threshold for parallel versions --- c_reference/include/conv1d.h | 30 ++++++++++++++++------------- c_reference/src/conv1d.c | 37 ++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/c_reference/include/conv1d.h b/c_reference/include/conv1d.h index b5f295ecf..a7ed49315 100644 --- a/c_reference/include/conv1d.h +++ b/c_reference/include/conv1d.h @@ -8,7 +8,7 @@ NOTES for the conv layers -> The conv1d & conv1d_lr layers work for all cases and can be used unconstrained. - There are no hard constraints for the parallel version, but a points regarding the optimal usage are given below + There are no hard constraints for the parallel version, but a few points regarding its optimal usage are given below -> Dilation = 1 (no dilation) for all cases -> For the non-depthwise cases, store the matrices as described below. Permutation might be necessary -> The low-rank decomposition cannot be applied to the depthwise weight matrices. This is due to the out_channels/in_channels = 0 constarint imposed by the depthwise convolution. @@ -22,10 +22,10 @@ Important points regarding parallel versions -> Due to the above reason, the parallel layers is only recommended for large in_time inputs - This should typically be for in_time (without the padding) > 2 * (kernel_size + stride). 
Else there would not be enough time-steps to efficiently parallelize - For other shorter input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain) - For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently - The RAM usage would be lower and the function would not have any overheads (calculation of the iterators and MatMul-auxiliary variables) + This should typically be for in_time (without the padding) > 2 * num_steps_one_row + stride. Else there would not be enough time-steps to efficiently parallelise + We need at least 2 rows for a good MatMul performance. In the worst case the starting time step would be (stride - 1). Hence we choose 2 * num_steps_one_row + stride as the threshold + For the short input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain) + For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently due to the lower RAM usage and lack of any major overheads -> There is no support for depthwise for conv1d_parallel The regular convolution acts on all the channels while the depthwise acts only on one channel at a time This results in a non-contiguos memory access.
MatMul would need to process multiple such time-steps, while the MatVec would only need to process one @@ -66,8 +66,9 @@ typedef struct ConvLayers_Params { * 2: tanh * 3: relu */ -int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -102,8 +103,9 @@ typedef struct ConvLayers_Parallel_Params { * 2: tanh * 3: relu */ -int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -141,8 +143,9 @@ typedef struct ConvLayers_LR_Params { * 2: tanh * 3: relu */ -int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -184,8 +187,9 @@ typedef struct ConvLayers_LR_Parallel_Params { * 2: tanh * 3: relu */ -int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr_parallel(float* 
output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); // Auxiliary Layers diff --git a/c_reference/src/conv1d.c b/c_reference/src/conv1d.c index 19bd73bdd..2ab5b7f30 100644 --- a/c_reference/src/conv1d.c +++ b/c_reference/src/conv1d.c @@ -7,8 +7,9 @@ #include "conv1d.h" #include "utils.h" -int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { const ConvLayers_LR_Params* tparams= (ConvLayers_LR_Params*) params; @@ -96,8 +97,9 @@ int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, co return 0; } -int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0; @@ -112,7 +114,10 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha // Perform the convolution. Zero-pad is from 0 to padding and in_time + padding to in_time + 2 * padding // Buffer to hold the output. For corner cases, this will be realtively big. // But will be needed for the central condition (filter inside input). 
- unsigned buffer_steps = in_time / num_steps_one_row, rank = tparams->rank; + // If there are not enough time steps to linearise into one row, then allocate only 1 time step + unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ? + in_time / num_steps_one_row : 1; + unsigned rank = tparams->rank; // Buffer for W2 out float* temp_rank_out = (float*)malloc(buffer_steps * rank * sizeof(float)); // Buffer for W1 out @@ -147,9 +152,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha // Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row // Using the above logic, we can convert the MatVec opeartion into a MatMul operation // Ideally both implementation would be the same. However for edge devices the matMul was found to be faster matVec (both tilied) - // Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows + // Skip if at least 2 rows cannot be formed. 
The condition 2 * num_steps_one_row + stride is the worst case criterion // The MatVec will be used for the computation in-case the following block is skipped - if (in_time > ((kernel_size + stride) << 1)) { + if (in_time > ((num_steps_one_row << 1) + stride)) { t_in_start -= padding; // remove the padding offset temporarily t_in_end -= padding; // Used to keep track of the final processed index for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels); @@ -251,8 +256,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha return 0; } -int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { const ConvLayers_Params* tparams= (ConvLayers_Params*) params; @@ -331,8 +337,9 @@ int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const return 0; } -int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0; @@ -347,7 +354,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe // Perform the Convolution. Pad is from 0 to padding and in_time + padding to in_time + 2 * padding // Buffer to hold the output. For corner cases, this will be realtively big. 
// But will be needed for the central condition (filter inside input). - unsigned buffer_steps = in_time / num_steps_one_row; + // If there are not enough time steps to linearise into one row, then allocate only 1 time step + unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ? + in_time / num_steps_one_row : 1; float* temp_out = (float*)malloc(buffer_steps * out_channels * sizeof(float)); unsigned t_in_start, t_in_end, t_out; // Values are needed outside the loops. Hence declared here for (t_in_start = 0, t_in_end = kernel_size - 1, t_out = 0; @@ -375,9 +384,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe // Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row // Using the above logic, we can convert the MatVec opeartion into a MatMul operation // Ideally both implementation would be the same. However for edge devices the matMul was found to be faster matVec (both tilied) - // Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows + // Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst case criterion // The MatVec will be used for the computation in-case the following block is skipped - if (in_time > ((kernel_size + stride) << 1)) { + if (in_time > ((num_steps_one_row << 1) + stride)) { t_in_start -= padding; // remove the padding offset temporarily t_in_end -= padding; // Used to keep track of the final processed index for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);