Skip to content

Commit

Permalink
Change short sequence threshold for parallel versions
Browse files Browse the repository at this point in the history
  • Loading branch information
Anirudh0707 committed Nov 29, 2020
1 parent 2b712a1 commit f98d9d9
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 27 deletions.
30 changes: 17 additions & 13 deletions c_reference/include/conv1d.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
NOTES for the conv layers
-> The conv1d & conv1d_lr layers work for all cases and can be used unconstrained.
There are no hard constraints for the parallel version, but a points regarding the optimal usage are given below
There are no hard constraints for the parallel version, but a few points regarding its optimal usage are given below
-> Dilation = 1 (no dilation) for all cases
-> For the non-depthwise cases, store the matrices as described below. Permutation might be necessary
-> The low-rank decomposition cannot be applied to the depthwise weight matrices. This is due to the out_channels/in_channels = 0 constraint imposed by the depthwise convolution.
Expand All @@ -22,10 +22,10 @@
Important points regarding parallel versions
-> Due to the above reason, the parallel layers are only recommended for large in_time inputs
This should typically be for in_time (without the padding) > 2 * (kernel_size + stride). Else there would not be enough time-steps to efficiently parallelize
For other shorter input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain)
For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently
The RAM usage would be lower and the function would not have any overheads (calculation of the iterators and MatMul-auxiliary variables)
This should typically be for in_time (without the padding) > 2 * num_steps_one_row + stride. Else there would not be enough time-steps to efficiently parallelise
We need at least 2 rows for good MatMul performance. In the worst case the starting time step would be (stride - 1). Hence we choose 2 * num_steps_one_row + stride as the threshold
For the short input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain)
For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently due to the lower RAM usage and lack of any major overheads
-> There is no support for depthwise for conv1d_parallel
The regular convolution acts on all the channels while the depthwise acts only on one channel at a time
This results in a non-contiguous memory access. MatMul would need to process multiple such time-steps, while the MatVec would only need to process one
Expand Down Expand Up @@ -66,8 +66,9 @@ typedef struct ConvLayers_Params {
* 2: tanh
* 3: relu
*/
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation);

/**
Expand Down Expand Up @@ -102,8 +103,9 @@ typedef struct ConvLayers_Parallel_Params {
* 2: tanh
* 3: relu
*/
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation);

/**
Expand Down Expand Up @@ -141,8 +143,9 @@ typedef struct ConvLayers_LR_Params {
* 2: tanh
* 3: relu
*/
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation);

/**
Expand Down Expand Up @@ -184,8 +187,9 @@ typedef struct ConvLayers_LR_Parallel_Params {
* 2: tanh
* 3: relu
*/
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation);

// Auxiliary Layers
Expand Down
37 changes: 23 additions & 14 deletions c_reference/src/conv1d.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#include "conv1d.h"
#include "utils.h"

int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation) {

const ConvLayers_LR_Params* tparams= (ConvLayers_LR_Params*) params;
Expand Down Expand Up @@ -96,8 +97,9 @@ int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, co
return 0;
}

int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation) {

unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0;
Expand All @@ -112,7 +114,10 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
// Perform the convolution. Zero-pad is from 0 to padding and in_time + padding to in_time + 2 * padding
// Buffer to hold the output. For corner cases, this will be relatively big.
// But will be needed for the central condition (filter inside input).
unsigned buffer_steps = in_time / num_steps_one_row, rank = tparams->rank;
// If there are not enough time steps to linearise into one row, then allocate only 1 time step
unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ?
in_time / num_steps_one_row : 1;
unsigned rank = tparams->rank;
// Buffer for W2 out
float* temp_rank_out = (float*)malloc(buffer_steps * rank * sizeof(float));
// Buffer for W1 out
Expand Down Expand Up @@ -147,9 +152,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
// Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row
// Using the above logic, we can convert the MatVec operation into a MatMul operation
// Ideally both implementations would be the same. However for edge devices the MatMul was found to be faster than the MatVec (both tiled)
// Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows
// Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst-case criterion
// The MatVec will be used for the computation in case the following block is skipped
if (in_time > ((kernel_size + stride) << 1)) {
if (in_time > ((num_steps_one_row << 1) + stride)) {
t_in_start -= padding; // remove the padding offset temporarily
t_in_end -= padding; // Used to keep track of the final processed index
for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);
Expand Down Expand Up @@ -251,8 +256,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
return 0;
}

int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation) {

const ConvLayers_Params* tparams= (ConvLayers_Params*) params;
Expand Down Expand Up @@ -331,8 +337,9 @@ int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const
return 0;
}

int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
const float* input_signal, unsigned in_time, unsigned in_channels,
unsigned padding, unsigned kernel_size,
const void* params, unsigned stride, unsigned activation) {

unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0;
Expand All @@ -347,7 +354,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe
// Perform the Convolution. Pad is from 0 to padding and in_time + padding to in_time + 2 * padding
// Buffer to hold the output. For corner cases, this will be relatively big.
// But will be needed for the central condition (filter inside input).
unsigned buffer_steps = in_time / num_steps_one_row;
// If there are not enough time steps to linearise into one row, then allocate only 1 time step
unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ?
in_time / num_steps_one_row : 1;
float* temp_out = (float*)malloc(buffer_steps * out_channels * sizeof(float));
unsigned t_in_start, t_in_end, t_out; // Values are needed outside the loops. Hence declared here
for (t_in_start = 0, t_in_end = kernel_size - 1, t_out = 0;
Expand Down Expand Up @@ -375,9 +384,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe
// Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row
// Using the above logic, we can convert the MatVec operation into a MatMul operation
// Ideally both implementations would be the same. However for edge devices the MatMul was found to be faster than the MatVec (both tiled)
// Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows
// Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst-case criterion
// The MatVec will be used for the computation in case the following block is skipped
if (in_time > ((kernel_size + stride) << 1)) {
if (in_time > ((num_steps_one_row << 1) + stride)) {
t_in_start -= padding; // remove the padding offset temporarily
t_in_end -= padding; // Used to keep track of the final processed index
for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);
Expand Down

0 comments on commit f98d9d9

Please sign in to comment.