diff --git a/docs/design_docs/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.md b/docs/design_docs/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.md
index c95666696..e565284b5 100644
--- a/docs/design_docs/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.md
+++ b/docs/design_docs/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.md
@@ -11,6 +11,7 @@
 | Version | Author | Date | Description |
 | ----- | ----- | ------ | ------- |
 | V1.0 | xuminjie | 2023 | Initial version |
+| V1.1 | wangyuan | 2024.11.08 | Fix latent defects introduced by sync and algorithm ordering |
 
 - #### Content description
 
@@ -80,10 +81,10 @@ max mode: based on point2voxel_map, find per group the positions where feats equals voxel_feats
 | Input restriction | The inputs `grad_voxel_feats`, `feats`, and `voxel_feats` may contain nan or inf |
 | Parameter restriction | Only reduce_mode = MLUOP_REDUCEMODE_MAX is supported |
 | Data type restriction | The inputs `grad_voxel_feats`, `feats`, `voxel_feats` and the output `grad_feats` must share the same data type; `point2voxel_map`, `voxel_points_count`, and `voxel_num` must share the same data type |
-| Layout restriction | None |
-| In-place restriction | In-place operation is not supported |
+| Layout restriction | None |
+| In-place restriction | In-place operation is not supported |
 | Stride restriction | The stride mechanism is not supported |
-| Broadcast restriction | Broadcasting is not supported |
+| Broadcast restriction | Broadcasting is not supported |
 
 ### 1.5 Acceptance criteria
 
@@ -167,7 +168,26 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(
 
 #### 3.1.1 Computation principle
 
-The `dynamic_point_to_voxel_backward` operator has 7 inputs: `reduce_type`, `grad_voxel_feats`, `feats`, `voxel_feats`, `point2voxel_map`, `voxel_points_count`, `voxel_num`, and 1 output: `grad_feats`. Based on the operator functionality in section 1.2, it can be implemented with 2 kernels:
+The `dynamic_point_to_voxel_backward` operator has 7 inputs and 1 output.
+
+input:
+1. `feats`, `shape=[N,C]`
+2. `grad_voxel_feats`, `shape=[M,C]`
+3. `voxel_feats`, `shape=[M,C]`
+4. `point2voxel_map`, `shape=[N]`
+5. `voxel_points_count`, `shape=[M]`
+6. `voxel_num`, `shape=[1]`
+7. `reduce_type`
+
+output:
+1. `grad_feats`, `shape=[N,C]`
+
+In `dynamic_point_to_voxel_forward`:
+- `coors` holds, for each of the N point-cloud points, its coordinate in the 3-D voxel grid; `feats` holds the N points, each with C features
+- the entries of `feats` and `coors` correspond one-to-one
+- that operator sets coordinates lying outside the all-positive range to -1, deduplicates and sorts the coordinates, and then reduces the corresponding `feats` according to `reduce_mode`
+
+Based on the operator functionality in section 1.2, `dynamic_point_to_voxel_backward` can be implemented with 2 kernels:
 
 - #### Computation logic
 
@@ -176,7 +196,22 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(
 
 First initialize `voxel_from` to the maximum value N;
 
-Using the feature-to-voxel-feature mapping recorded in `point2voxel_map`, compare the input features `feats` with the voxel features `voxel_feats`: for the i-th voxel feature `voxel_feats[i]`, if the j-th feature equals it, that voxel feature is considered to originate from this feature. The index relation is stored in the intermediate result `voxel_from`, i.e. `voxel_from[i]=j`; this intermediate result is kept in workspace.
+Using the feature-to-voxel-feature mapping recorded in `point2voxel_map`, compare the input features `feats` with the voxel features `voxel_feats`:
+1. For the `c`-th channel (`c=0,1,2,...,C-1`) of the i-th voxel feature, `voxel_feats[i,c]`, if it equals the `c`-th channel of the j-th feature, `feats[j, c]`, that voxel feature is considered to originate from this feature
+2. If `voxel_feats[i,c]` equals several features `feats[j, c]`, `feats[k, c]`, the feature with the smaller index, `feats[j, c]` (`j < k`), is taken as its source
[...]
       queue, feats, voxel_feats,
-      grad_feats, workspace, point2voxel_map, voxel_num, N, C));
+      workspace, point2voxel_map, voxel_num, N, C));
   // 4. scatter
   cnnlScatterNdMode_t scatter_mode = CNNL_SCATTERND_ADD;
   mluOpTensorDescriptor_t updates_desc;
diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h
index 2a4df91b4..6431a589d 100644
--- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h
+++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h
@@ -29,9 +29,9 @@ DYNAMIC_POINT_TO_VOXEL_BACKWARD_H
 
 mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward(
     cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    const void *feats, const void *voxel_feats, void *grad_feats,
-    void *voxel_from, const void *point2voxel_map, const void *voxel_num,
-    const int N, const int C);
+    const void *feats, const void *voxel_feats, void *voxel_from,
+    const void *point2voxel_map, const void *voxel_num, const int N,
+    const int C);
 
 #endif  // KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_
         // DYNAMIC_POINT_TO_VOXEL_FORWARD_H
diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu
index c114aca07..708aaa856 100644
--- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu
+++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu
@@ -31,171 +31,183 @@ __nram__ int8_t nram_buffer[MAX_NRAM_SIZE];
 
 template <typename T>
 __mlu_func__ void loadAsync(T *feats_nram, T *voxel_feats_nram,
-                            int *index_mask_nram, int *voxel_from_nram,
-                            int *point2voxel_map_real_nram,
-                            const int *point2voxel_map_nram,
-                            const int *index_col_nram, const T *feats,
-                            const T *voxel_feats, const int *voxel_from, int &x,
-                            int &n_real, const int n_limit, const int N,
-                            const int C) {
-  int invalid_index = -1;
-  int size_feats = C * sizeof(T);
-  int size_feats_idx = C * sizeof(int);
-  n_real = 0;
-  for (; x < N && n_real < n_limit; x++) {
-    int point_to = point2voxel_map_nram[x];
-    int input_offset = x * C;
-    int input_real_offset = n_real * C;
+                            int *feats_index_nram, int *voxel_from_nram,
+                            int *map_curr_ipu, const int *map_global,
+                            const int *dim_c_idx, const T *feats,
+                            const T *voxel_feats, const int *voxel_from,
+                            int &n_global, int &n_curr_ipu,
+                            const int n_deal_num, const int N, const int C) {
+  const int invalid_index = -1;
+  const int size_feats = C * sizeof(T);
+  const int size_feats_idx = C * sizeof(int);
+  n_curr_ipu = 0;
+  for (; n_global < N && n_curr_ipu < n_deal_num; ++n_global) {
+    // calculate offset
+    int gdram_voxel_feat_offset;
+    const int gdram_feat_offset = n_global * C;
+    const int nram_offset = n_curr_ipu * C;
+
+    const int point_to = map_global[n_global];
     if (taskId == point_to % taskDim) {
       if (point_to == invalid_index) {
         continue;
       }
-      int reduced_offset = point_to * C;
-      // load valid data to feats_nram
-      __memcpy_async(feats_nram + input_real_offset, feats + input_offset,
+      gdram_voxel_feat_offset = point_to * C;
+      // load feats
+      // feats_nram = [feats[0],feats[1],...,feats[n_curr_ipu-1]]
+      __memcpy_async(feats_nram + nram_offset, feats + gdram_feat_offset,
                      size_feats, GDRAM2NRAM);
-      // boradcast voxel_feats data to voxel_feats_nram via the same "point_to"
-      __memcpy_async(voxel_feats_nram + input_real_offset,
-                     voxel_feats + reduced_offset, size_feats, GDRAM2NRAM);
-      // boradcast voxel_from data to voxel_from_nram via the same "point_to"
-      __memcpy_async(voxel_from_nram + input_real_offset,
-                     voxel_from + reduced_offset, size_feats_idx, GDRAM2NRAM);
-      // record valid index of x in index_mask_nram
-      __bang_write_value(index_mask_nram + input_real_offset, C, x * C);
+      // load voxel_feats
+      // voxel_feats_nram = [voxel_feats[0],voxel_feats[0],voxel_feats[1],...]
+      // when map = [0,0,1...]
+      __memcpy_async(voxel_feats_nram + nram_offset,
+                     voxel_feats + gdram_voxel_feat_offset, size_feats,
+                     GDRAM2NRAM);
+
+      // load voxel2point
+      __memcpy_async(voxel_from_nram + nram_offset,
+                     voxel_from + gdram_voxel_feat_offset, size_feats_idx,
+                     GDRAM2NRAM);
+
+      // set feat-points index
+      __bang_write_value(feats_index_nram + nram_offset, C, n_global * C);
+
       // point2voxel_map removed invalid data
-      point2voxel_map_real_nram[n_real] = point_to;
-      ++n_real;
+      map_curr_ipu[n_curr_ipu] = point_to;
+      ++n_curr_ipu;
     }
   }
-  if (n_real > 0) {
-    __bang_cycle_add(index_mask_nram, index_mask_nram, index_col_nram,
-                     n_real * C, C);
+  if (n_curr_ipu > 0) {
+    // update feat-points index
+    __bang_cycle_add(feats_index_nram, feats_index_nram, dim_c_idx,
+                     n_curr_ipu * C, C);
   }
 }
 
 template <typename T>
 __mlu_func__ void compute(T *feats_nram, T *voxel_feats_nram,
-                          int *index_mask_nram, int *voxel_from_nram,
-                          const int n_real, const int N, const int C) {
-  if (n_real > 0) {
-    // view [n_real, C] as [n_real * C]
-    int deal_num = n_real * C;
-    // if (feats[i] == voxel_feats[i]) {mask[i] = 1} else {mask[i] = 0}
+                          int *feats_index_nram, int *voxel_from_nram,
+                          const int n_curr_ipu, const int N, const int C) {
+  if (n_curr_ipu > 0) {
+    // feats[i] == voxel_feats[i] ? mask[i] = 1 : mask[i] = 0
+    const int deal_num = n_curr_ipu * C;
     __bang_eq(feats_nram, voxel_feats_nram, feats_nram, deal_num);
-    // change mask1's dtype to int32
     __bang_float2int32_tz((int *)feats_nram, feats_nram, deal_num, 0);
-    // mask2 = NOT mask1
+
+    // recover feats_index (local->global)
+    // recover !mask to N*C
     __bang_not((int *)voxel_feats_nram, (int *)feats_nram, deal_num);
-    // choose index of "feats[i] == voxel_feats[i]"
-    __bang_mul((int *)feats_nram, (int *)feats_nram, index_mask_nram, deal_num);
-    // mask2 *= N * C
+    __bang_mul((int *)feats_nram, (int *)feats_nram, feats_index_nram,
+               deal_num);
     __bang_mul_scalar((int *)voxel_feats_nram, (int *)voxel_feats_nram, N * C,
                       deal_num);
-    // mix choosed index and 'N * C'
-    __bang_add(index_mask_nram, (int *)voxel_feats_nram, (int *)feats_nram,
+
+    // mix mask and !mask, and choose the min index
+    __bang_add(feats_index_nram, (int *)voxel_feats_nram, (int *)feats_nram,
               deal_num);
-    // choose the min index
-    __bang_minequal(voxel_from_nram, voxel_from_nram, index_mask_nram,
+    __bang_minequal(voxel_from_nram, voxel_from_nram, feats_index_nram,
                     deal_num);
   }
 }
 
 __mlu_func__ void storeAsync(int *voxel_from, const int *voxel_from_nram,
-                             const int *point2voxel_map_real_nram,
-                             bool *voxel_from_flag_nram, int *index_mask_nram,
-                             const int n_real, const int N, const int C) {
-  int size_feats_idx = C * sizeof(int);
-  for (int i = 0; i < n_real; i++) {
-    int offset_real = point2voxel_map_real_nram[i];
-    // 1) use atomicmin, too slow
-    // __bang_atomic_reduce_min(voxel_from + offset_real * C,
-    //                          voxel_from_nram + i * C, C);
-    // 2) compare one by one, use voxel_from_flag_nram as flags to record
-    // whether dst idx has appeard
-    if (voxel_from_flag_nram[offset_real] == false) {
-      // if number of grad idx on offset_real == 1, use the idx value directly
-      __memcpy_async(voxel_from + offset_real * C, voxel_from_nram + i * C,
-                     size_feats_idx, NRAM2GDRAM);
-      // set voxel_from_flag to true
-      voxel_from_flag_nram[offset_real] = true;
-    } else {
-      __sync_io();
-      // load the idx appeard
-      __memcpy(index_mask_nram, voxel_from + offset_real * C, size_feats_idx,
-               GDRAM2NRAM);
-      // if number of grad idx on offset_real > 1, pick the min idx value
-      __bang_minequal(index_mask_nram, index_mask_nram, voxel_from_nram + i * C,
-                      C);
-      // store the new idx
-      __memcpy(voxel_from + offset_real * C, index_mask_nram, size_feats_idx,
-               NRAM2GDRAM);
+                             const int *map_curr_ipu, bool *voxel_count_flag,
+                             int *feats_index_nram, const int n_curr_ipu,
+                             const int N, const int C) {
+  for (int i = 0; i < n_curr_ipu; i++) {
+#if __BANG_ARCH__ >= 592
+    // better performance for mlu590
+    __bang_atomic_reduce_min(voxel_from + map_curr_ipu[i] * C,
+                             voxel_from_nram + i * C, C);
+#else
+    const int offset_local = map_curr_ipu[i];
+    if (taskId == offset_local % taskDim) {
+      if (!voxel_count_flag[offset_local]) {
+        __memcpy(voxel_from + offset_local * C, voxel_from_nram + i * C,
+                 C * sizeof(int), NRAM2GDRAM);
+        voxel_count_flag[offset_local] = true;
+      } else {
+        __memcpy(feats_index_nram, voxel_from + offset_local * C,
+                 C * sizeof(int), GDRAM2NRAM);
+        __bang_minequal(feats_index_nram, feats_index_nram,
+                        voxel_from_nram + i * C, C);
+        __memcpy(voxel_from + offset_local * C, feats_index_nram,
+                 C * sizeof(int), NRAM2GDRAM);
+      }
     }
+#endif
   }
 }
 
 template <typename T>
 __mlu_global__ void MLUKernelMaxReduceTracebackScatterIdx(
-    const T *feats, const T *voxel_feats, T *grad_feats, int *voxel_from,
+    const T *feats, const T *voxel_feats, int *voxel_from,
     const int *point2voxel_map, const int *voxel_num, const int N,
     const int C) {
   const int M = *voxel_num;
-  if (M == 0) {
+  if (__is_mpu() || M == 0) {
     return;
   }
-  int size_input = N * sizeof(int);
-  int size_reduced_flag = M * sizeof(bool);
-  int size_feats = C * sizeof(T);
-  int size_feats_idx = C * sizeof(int);
-
-  int nram_size = MAX_NRAM_SIZE;
-  int n_limit = (nram_size - size_input - size_reduced_flag - size_feats_idx) /
-                (2 * size_feats + 2 * size_feats_idx + sizeof(int));
-  int feats_limit = n_limit * C;
-
-  T *feats_nram = (T *)nram_buffer;                           // [n_limit, C]
-  T *voxel_feats_nram = feats_nram + feats_limit;             // [n_limit, C]
-  int *index_mask_nram =
-      (int *)(voxel_feats_nram + feats_limit);                // [n_limit, C]
-  int *voxel_from_nram = index_mask_nram + feats_limit;       // [n_limit, C]
-  int *point2voxel_map_nram = voxel_from_nram + feats_limit;  // [N]
-  int *point2voxel_map_real_nram = point2voxel_map_nram + N;  // [n_limit]
-  bool *voxel_from_flag_nram =
-      (bool *)(point2voxel_map_real_nram + n_limit);  // [M]
-  int *index_col_nram = (int *)(voxel_from_flag_nram + M);  // [C]
-
-  __sync_all();
-
-  // broadcast point2voxel_map to nram
-  __memcpy(point2voxel_map_nram, point2voxel_map, size_input, GDRAM2NRAM);
-  // initialze voxel_from_flag to false
-  __memset_nram(voxel_from_flag_nram, M, (int8_t) false);
+
+  /*
+   * NRAM partition
+   *  |==================|============================|
+   *  | Semantics        | Size                       |
+   *  |==================|============================|
+   *  | feats            | [n_deal_num, C], float     |
+   *  | voxel_feats      | [n_deal_num, C], float     |
+   *  | index_mask       | [n_deal_num, C], int       |
+   *  | voxel_from       | [n_deal_num, C], int       |
+   *  | map_curr_ipu     | [n_deal_num], int          |
+   *  | map_global       | [N], int                   |
+   *  | dim_c_idx        | [C], int                   |
+   *  | voxel_count_flag | [M], bool                  |
+   *  |==================|============================|
+   */
+  const int n_deal_num =
+      (MAX_NRAM_SIZE - N * sizeof(int) - M - C * sizeof(int)) /
+      (2 * C * sizeof(T) + 2 * C * sizeof(int) + sizeof(int));
+  const int feats_num = n_deal_num * C;
+
+  T *feats_nram = (T *)nram_buffer;
+  T *voxel_feats_nram = feats_nram + feats_num;
+  int *feats_index_nram = (int *)(voxel_feats_nram + feats_num);
+  int *voxel_from_nram = feats_index_nram + feats_num;
+  int *map_global = voxel_from_nram + feats_num;
+  int *map_curr_ipu = map_global + N;
+  int *dim_c_idx = map_curr_ipu + n_deal_num;
+  bool *voxel_count_flag = (bool *)(dim_c_idx + C);
+
+  // load point2voxel_map & init voxel_count_flag
+  __memcpy(map_global, point2voxel_map, N * sizeof(int), GDRAM2NRAM);
+  __memset_nram(voxel_count_flag, M, (int8_t) false);
+
+  // init dim_c_idx: 0,1,2,...,C-1
   for (int i = 0; i < C; i++) {
-    index_col_nram[i] = i;
+    dim_c_idx[i] = i;
   }
-  for (int x = 0, n_real = 0; x < N;) {
-    // load data, get x and n_real
-    loadAsync(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram,
-              point2voxel_map_real_nram, point2voxel_map_nram, index_col_nram,
-              feats, voxel_feats, voxel_from, x, n_real, n_limit, N, C);
+
+  for (int n_global = 0, n_curr_ipu = 0; n_global < N;) {
+    loadAsync(feats_nram, voxel_feats_nram, feats_index_nram, voxel_from_nram,
+              map_curr_ipu, map_global, dim_c_idx, feats, voxel_feats,
+              voxel_from, n_global, n_curr_ipu, n_deal_num, N, C);
+    __sync();
+    compute(feats_nram, voxel_feats_nram, feats_index_nram, voxel_from_nram,
+            n_curr_ipu, N, C);
     __sync();
-    // compute
-    compute(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram,
-            n_real, N, C);
-    // store
-    storeAsync(voxel_from, voxel_from_nram, point2voxel_map_real_nram,
-               voxel_from_flag_nram, index_mask_nram, n_real, N, C);
+    storeAsync(voxel_from, voxel_from_nram, map_curr_ipu, voxel_count_flag,
+               feats_index_nram, n_curr_ipu, N, C);
     __sync();
   }
 }
 
 mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward(
     cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    const void *feats, const void *voxel_feats, void *grad_feats,
-    void *voxel_from, const void *point2voxel_map, const void *voxel_num,
-    const int N, const int C) {
+    const void *feats, const void *voxel_feats, void *voxel_from,
+    const void *point2voxel_map, const void *voxel_num, const int N,
+    const int C) {
   KERNEL_CHECK(MLUKernelMaxReduceTracebackScatterIdx<<<k_dim, k_type, queue>>>(
-      (const float *)feats, (const float *)voxel_feats, (float *)grad_feats,
-      (int *)voxel_from, (const int *)point2voxel_map, (const int *)voxel_num,
-      N, C));
+      (const float *)feats, (const float *)voxel_feats, (int *)voxel_from,
+      (const int *)point2voxel_map, (const int *)voxel_num, N, C));
   return MLUOP_STATUS_SUCCESS;
 }
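
For reviewers, the following is a minimal host-side sketch of the max-mode backward semantics that the patched kernels implement: first trace each `voxel_feats[i, c]` back to the lowest-index point whose `feats[j, c]` produced it (`voxel_from`), then scatter-add `grad_voxel_feats` into `grad_feats` at those indices. It is illustrative only and not part of the patch; the function name `dptv_backward_max_ref` and the use of `std::vector` are assumptions, while the tensor names, shapes, and the -1 convention for invalid points follow the design document above.

```cpp
#include <algorithm>
#include <vector>

// Reference of the max-mode backward pass: per channel c, the gradient of
// voxel_feats[i, c] flows back to the lowest-index point j with
// feats[j, c] == voxel_feats[i, c].
void dptv_backward_max_ref(const std::vector<float> &grad_voxel_feats,  // [M, C]
                           const std::vector<float> &feats,             // [N, C]
                           const std::vector<float> &voxel_feats,       // [M, C]
                           const std::vector<int> &point2voxel_map,     // [N]
                           int N, int M, int C,
                           std::vector<float> &grad_feats) {            // [N, C]
  grad_feats.assign(static_cast<size_t>(N) * C, 0.0f);
  // voxel_from[i * C + c] = flat index into feats of the traced-back point,
  // initialized to the out-of-range value N * C meaning "no source found".
  std::vector<int> voxel_from(static_cast<size_t>(M) * C, N * C);
  for (int j = 0; j < N; ++j) {
    const int i = point2voxel_map[j];
    if (i < 0) continue;  // -1 marks points dropped by the forward pass
    for (int c = 0; c < C; ++c) {
      if (feats[j * C + c] == voxel_feats[i * C + c]) {
        // keep the smallest point index when several points tie on the max
        voxel_from[i * C + c] = std::min(voxel_from[i * C + c], j * C + c);
      }
    }
  }
  // scatter-add step: route each voxel gradient to the traced-back point;
  // entries of grad_feats with no source stay zero.
  for (int i = 0; i < M; ++i) {
    for (int c = 0; c < C; ++c) {
      const int src = voxel_from[i * C + c];
      if (src != N * C) grad_feats[src] += grad_voxel_feats[i * C + c];
    }
  }
}
```

Kernel 1 in the patch (`MLUKernelMaxReduceTracebackScatterIdx`) produces the `voxel_from` indices in workspace, corresponding to the first loop above, and step 4 (the `cnnlScatterNd` call with `CNNL_SCATTERND_ADD`) corresponds to the second loop.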