From f98d9d9ef60082f0ecb756e57279de058b9433f4 Mon Sep 17 00:00:00 2001 From: Anirudh0707 Date: Sun, 29 Nov 2020 17:56:16 +0530 Subject: [PATCH] Change short sequence threshold for parallel versions --- c_reference/include/conv1d.h | 30 ++++++++++++++++------------- c_reference/src/conv1d.c | 37 ++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/c_reference/include/conv1d.h b/c_reference/include/conv1d.h index b5f295ecf..a7ed49315 100644 --- a/c_reference/include/conv1d.h +++ b/c_reference/include/conv1d.h @@ -8,7 +8,7 @@ NOTES for the conv layers -> The conv1d & conv1d_lr layers work for all cases and can be used unconstrained. - There are no hard constraints for the parallel version, but a points regarding the optimal usage are given below + There are no hard constraints for the parallel version, but a few points regarding its optimal usage are given below -> Dilation = 1 (no dilation) for all cases -> For the non-depthwise cases, store the matrices as described below. Permutation might be necessary -> The low-rank decomposition cannot be applied to the depthwise weight matrices. This is due to the out_channels/in_channels = 0 constarint imposed by the depthwise convolution. @@ -22,10 +22,10 @@ Important points regarding parallel versions -> Due to the above reason, the parallel layers is only recommended for large in_time inputs - This should typically be for in_time (without the padding) > 2 * (kernel_size + stride). 
Else there would not be enough time-steps to efficiently parallelize - For other shorter input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain) - For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently - The RAM usage would be lower and the function would not have any overheads (calculation of the iterators and MatMul-auxiliary variables) + This should typically be for in_time (without the padding) > 2 * num_steps_one_row + stride. Else there would not be enough time-steps to efficiently parallelise + We need at least 2 rows for a good MatMul performance. In the worst case the starting time step would be (stride - 1). Hence we choose 2 * num_steps_one_row + stride as the threshold + For the short input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain) + For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently due to the lower RAM usage and lack of any major overheads -> There is no support for depthwise for conv1d_parallel The regular convolution acts on all the channels while the depthwise acts only on one channel at a time This results in a non-contiguos memory access.
MatMul would need to process multiple such time-steps, while the MatVec would only need to process one @@ -66,8 +66,9 @@ typedef struct ConvLayers_Params { * 2: tanh * 3: relu */ -int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -102,8 +103,9 @@ typedef struct ConvLayers_Parallel_Params { * 2: tanh * 3: relu */ -int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -141,8 +143,9 @@ typedef struct ConvLayers_LR_Params { * 2: tanh * 3: relu */ -int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); /** @@ -184,8 +187,9 @@ typedef struct ConvLayers_LR_Parallel_Params { * 2: tanh * 3: relu */ -int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr_parallel(float* 
output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation); // Auxiliary Layers diff --git a/c_reference/src/conv1d.c b/c_reference/src/conv1d.c index 19bd73bdd..2ab5b7f30 100644 --- a/c_reference/src/conv1d.c +++ b/c_reference/src/conv1d.c @@ -7,8 +7,9 @@ #include "conv1d.h" #include "utils.h" -int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { const ConvLayers_LR_Params* tparams= (ConvLayers_LR_Params*) params; @@ -96,8 +97,9 @@ int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, co return 0; } -int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0; @@ -112,7 +114,10 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha // Perform the convolution. Zero-pad is from 0 to padding and in_time + padding to in_time + 2 * padding // Buffer to hold the output. For corner cases, this will be realtively big. // But will be needed for the central condition (filter inside input). 
- unsigned buffer_steps = in_time / num_steps_one_row, rank = tparams->rank; + // If there are not enough time steps to linearise into one row, then allocate only 1 time step + unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ? + in_time / num_steps_one_row : 1; + unsigned rank = tparams->rank; // Buffer for W2 out float* temp_rank_out = (float*)malloc(buffer_steps * rank * sizeof(float)); // Buffer for W1 out @@ -147,9 +152,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha // Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row // Using the above logic, we can convert the MatVec opeartion into a MatMul operation // Ideally both implementation would be the same. However for edge devices the matMul was found to be faster matVec (both tilied) - // Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows + // Skip if at least 2 rows cannot be formed. 
The condition 2 * num_steps_one_row + stride is the worst case criterion // The MatVec will be used for the computation in-case the following block is skipped - if (in_time > ((kernel_size + stride) << 1)) { + if (in_time > ((num_steps_one_row << 1) + stride)) { t_in_start -= padding; // remove the padding offset temporarily t_in_end -= padding; // Used to keep track of the final processed index for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels); @@ -251,8 +256,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha return 0; } -int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { const ConvLayers_Params* tparams= (ConvLayers_Params*) params; @@ -331,8 +337,9 @@ int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const return 0; } -int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal, - unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size, +int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, + const float* input_signal, unsigned in_time, unsigned in_channels, + unsigned padding, unsigned kernel_size, const void* params, unsigned stride, unsigned activation) { unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0; @@ -347,7 +354,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe // Perform the Convolution. Pad is from 0 to padding and in_time + padding to in_time + 2 * padding // Buffer to hold the output. For corner cases, this will be realtively big. 
// But will be needed for the central condition (filter inside input). - unsigned buffer_steps = in_time / num_steps_one_row; + // If there are not enough time steps to linearise into one row, then allocate only 1 time step + unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ? + in_time / num_steps_one_row : 1; float* temp_out = (float*)malloc(buffer_steps * out_channels * sizeof(float)); unsigned t_in_start, t_in_end, t_out; // Values are needed outside the loops. Hence declared here for (t_in_start = 0, t_in_end = kernel_size - 1, t_out = 0; @@ -375,9 +384,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe // Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row // Using the above logic, we can convert the MatVec opeartion into a MatMul operation // Ideally both implementation would be the same. However for edge devices the matMul was found to be faster matVec (both tilied) - // Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows + // Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst case criterion // The MatVec will be used for the computation in-case the following block is skipped - if (in_time > ((kernel_size + stride) << 1)) { + if (in_time > ((num_steps_one_row << 1) + stride)) { t_in_start -= padding; // remove the padding offset temporarily t_in_end -= padding; // Used to keep track of the final processed index for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);