From 5c1d4ceb8d26629ab9a31b6f3db9f3cd9b8b82fa Mon Sep 17 00:00:00 2001 From: mahxn0 <1262384588@qq.com> Date: Wed, 25 Dec 2024 11:11:33 +0800 Subject: [PATCH 1/3] [Docs](mlu-ops): update README. (#1189) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8856f319..c241f496a 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ MLU-OPS™提供了以下功能: ## 编译、开发与测试 提供基于 BANG C 的算子开发教程,涵盖算子入门、算子进阶、算子高级篇,帮助开发者迅速上手算子开发。 -具体见[ BANG C 算子开发指南](./docs/BANG%20C%20OPS-Develop-Guide.md)。 +具体见[ BANG C 算子开发指南](./docs/BANG-C-OPS-Develop-Guide.md)。 提供基于寒武纪人工智能单元(MLU)开发高性能算子、C 接口封装的示例代码。 MLU-OPS™ 具体的编译、开发与测试介绍见 [MLU-OPS™算子编译、开发与测试介绍](docs/MLU-OPS-Compile-Develop-And-Test.md)。 From 8fd98bc3d40edd7a0ea8eee9fde68d5a492b9eea Mon Sep 17 00:00:00 2001 From: liuduanhui <103939338+DanieeelLiu@users.noreply.github.com> Date: Mon, 30 Dec 2024 15:12:38 +0800 Subject: [PATCH 2/3] [Fix](mlu-ops): Remove deprecated instruction for 4.0 (#1187) (#1188) --- core/runtime/device.h | 2 +- .../moe_dispatch_backward_gate.md | 4 +- .../ms_deform_attn_backward.md | 4 +- .../points_in_boxes/points_in_boxes.md | 2 +- .../prior_box/prior_box_design_doc.md | 2 +- .../roipoint_pool3d/roipoint_pool3d.md | 2 +- kernels/ball_query/ball_query_union1.mlu | 2 +- .../box_iou_rotated/box_iou_rotated_aligned.h | 7 +- .../box_iou_rotated_nonaligned.h | 9 +- .../box_iou_rotated/box_iou_rotated_utils.h | 10 +- kernels/carafe/carafe_block.mlu | 16 +-- ...ou_rotated_sort_vertices_forward_block.mlu | 2 +- kernels/lgamma/lgamma_block.mlu | 2 +- kernels/logspace/logspace.cpp | 2 +- kernels/logspace/logspace_block.mlu | 22 +++-- .../moe_dispatch_backward_gate_union1.mlu | 4 +- ...rm_attn_backward_small_channels_union1.mlu | 14 +-- .../ms_deform_attn_backward_union1.mlu | 17 ++-- .../msda_forward_small_channel_union1.mlu | 2 +- ...l_information_backward_3pipeline_block.mlu | 9 +- kernels/nms_rotated/nms_rotated_union1.mlu | 10 +- kernels/nms_rotated/nms_utils.h | 2 +- kernels/prior_box/prior_box_block.mlu | 4 +- .../roi_align_rotated_forward_vector.mlu | 2 +- .../roipoint_pool3d_union1.mlu | 7 +- ...roipoint_pool3d_union1_large_boxes_num.mlu | 4 +- .../three_interpolate_union1.mlu | 22 +++-- kernels/tin_shift/tin_shift_union1.mlu | 4 +- kernels/voxelization/voxelization_kernel.mlu | 2 +- kernels/yolo_box/yolo_box_block.mlu | 98 +++++++++---------- test/mlu_op_gtest/pb_gtest/include/runtime.h | 2 +- test/mlu_op_gtest/pb_gtest/src/executor.cpp | 6 +- .../pb_gtest/src/hardware_monitor.cpp | 60 ------------ 33 files changed, 156 insertions(+), 201 deletions(-) diff --git a/core/runtime/device.h b/core/runtime/device.h index fe8aed516..5bf367b1e 100644 --- a/core/runtime/device.h +++ b/core/runtime/device.h @@ -119,7 +119,7 @@ inline int32_t getClusterNumberOfJobLimitCapability(mluOpHandle_t handle) { inline cnrtFunctionType_t castCnKernelClassToCnrtFuncType(KernelClass jobType) { switch (jobType) { default: - return CNRT_FUNC_TYPE_MUTABLE; + return cnrtFuncTypeMutable; case CN_KERNEL_CLASS_BLOCK: return cnrtFuncTypeBlock; case CN_KERNEL_CLASS_UNION: diff --git a/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md b/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md index aac151304..1a5fec426 100644 --- a/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md +++ b/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md @@ -476,7 +476,7 @@ for (int i = 0; i < samples; ++i) { int sample_idx = 0; // 从 workspace load所有中间计算结果 T *nram_grad_gates = (T *)nram_buffer; - __bang_write_zero(nram_grad_gates, samples); + __bang_write_value(nram_grad_gates, samples, (T)0); for (int ti = 0; ti < taskDim; ti++) { if ((rem_task > 0) && (ti < (one_sample_task_num + 1) * rem_task)) { sample_idx = (int)(ti / (one_sample_task_num + 1)); @@ -567,7 +567,7 @@ for (int i = 0; i < samples; ++i) { // 复用nram_location空间 T *nram_grad_gates = (T*)nram_location; - __bang_write_zero(nram_grad_gates, deal_s_num); + __bang_write_value(nram_grad_gates, deal_s_num, 0); // 三级流水计算过程 // step4 diff --git a/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md b/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md index 5dc407284..2002b873e 100644 --- a/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md +++ b/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md @@ -264,8 +264,8 @@ void msDeformAttnCol2imBilinear(){ __memcpy(top_grad, grad_output, deal_num * sizeof(T), GDRAM2NRAM); __bang_mul_scalar(top_grad_temp, top_grad, attn_weight, deal_num); - __bang_write_zero(grad_h_weight, deal_num); - __bang_write_zero(grad_w_weight, deal_num); + __bang_write_value(grad_h_weight, deal_num, 0); + __bang_write_value(grad_w_weight, deal_num, 0); if (h_low >= 0 && w_low >= 0) { const int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; __memcpy(grad_output_nram, grad_output + offset1, deal_num * sizeof(T), GDRAM2NRAM); diff --git a/docs/design_docs/points_in_boxes/points_in_boxes.md b/docs/design_docs/points_in_boxes/points_in_boxes.md index 1f6b83a83..20fe47b0c 100644 --- a/docs/design_docs/points_in_boxes/points_in_boxes.md +++ b/docs/design_docs/points_in_boxes/points_in_boxes.md @@ -303,7 +303,7 @@ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const fl X = points[0]; Y = points[m]; Z = points[2*m]; - bang_write_zero(last, 0); + bang_write_value(last, 0, 0); loop boxes for t in range(boxes_num): boxes = b * T * 7 + t * 7; (cx, cy, cz, dx, dy, dz, rz) = boxes[0:7]; diff --git a/docs/design_docs/prior_box/prior_box_design_doc.md b/docs/design_docs/prior_box/prior_box_design_doc.md index 9068cfd81..9ed64ec0a 100644 --- a/docs/design_docs/prior_box/prior_box_design_doc.md +++ b/docs/design_docs/prior_box/prior_box_design_doc.md @@ -394,7 +394,7 @@ mluOpPriorBox(mluOpHandle_t handle, 对`one_loop_pixel_num`循环处理,一次循环只初始化一个点的`num_priors`个框的坐标,设该点在`feature_map`上的索引为`pixel_index` - - 调用__bang_write_zero()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0] + - 调用__bang_write_value()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0] - 计算当前处理的点的位置,x_index = pixel_index % width,y_index = pixel_index / width,图中x_index = 0,1,y_index = 0,1,2。 - 将`x_index`(上图中x的坐标)和`x_mask`相乘,得到`tmp_x`(tmp_x为x_index,x_mask相乘的结果),tmp_x = [x_index,0,x_index,0,x_index,0,x_index,0] diff --git a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md index 60d1f2d98..965d3eb09 100644 --- a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md +++ b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md @@ -281,7 +281,7 @@ void roipoint_pool3d_union1(const int batch_size, __memcpy_async(ping_input2, points_y_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM); __memcpy_async(ping_input3, points_z_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM); __memcpy_async(point_features, point_features_start, span_num_deal_size, GDRAM2NRAM); - __bang_write_zero((T *)cnt, boxes_num); + __bang_write_value((T *)cnt, boxes_num, (T)0);; size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num; diff --git a/kernels/ball_query/ball_query_union1.mlu b/kernels/ball_query/ball_query_union1.mlu index 33c4b2bca..e7c1daa85 100644 --- a/kernels/ball_query/ball_query_union1.mlu +++ b/kernels/ball_query/ball_query_union1.mlu @@ -121,7 +121,7 @@ __mlu_func__ void ballQueryWorkflow( T *new_xyz_nram = vec_new_x1; __memcpy(new_xyz_nram, &new_xyz[base1], num_deal_new_xyz * 3 * sizeof(T), GDRAM2NRAM); - __bang_write_zero(vec_idx_num, num_stride); + __bang_write_value(vec_idx_num, num_stride, (int32_t)0); for (uint32_t new_index = index_new_xyz; new_index < (index_new_xyz + num_deal_new_xyz);) { diff --git a/kernels/box_iou_rotated/box_iou_rotated_aligned.h b/kernels/box_iou_rotated/box_iou_rotated_aligned.h index 67a948ffe..7bb94f192 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_aligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_aligned.h @@ -62,7 +62,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); + __bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair, + (uint8_t)0); void *box1_trans = nram_buffer + 4 * max_box_pair * sizeof(T); void *box2_trans = @@ -224,8 +225,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, (T *)temp4_ram, (T *)temp5_ram, actual_compute_box_num); // initialize valid_pts, nums_in - __bang_write_zero((T *)valid_pts, 24 * actual_compute_box_num); - __bang_write_zero((T *)nums_in_ram, actual_compute_box_num); + __bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0); + __bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0); // 3. Get all intersection points getIntersectionPoints( diff --git a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h index ec44b3abd..82db7292b 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h @@ -68,7 +68,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); + __bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair, + (uint8_t)0); void *box1_onchip = nram_buffer + 2 * max_box_pair * sizeof(T); void *box2_onchip = @@ -190,7 +191,7 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, const T area_thres = 1e-14; if (area1 < area_thres) { // set all current box-paires ious to zeros - __bang_write_zero((T *)ious_ram, actual_compute_box_num); + __bang_write_value((T *)ious_ram, actual_compute_box_num, (T)0); __memcpy(ious + current_ious_offset, (T *)ious_ram, actual_box2_num * sizeof(T), NRAM2GDRAM); continue; @@ -309,8 +310,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, (T *)temp3_ram, (T *)temp4_ram, actual_compute_box_num); - __bang_write_zero((T *)valid_pts, 24 * actual_compute_box_num); - __bang_write_zero((T *)nums_in_ram, actual_compute_box_num); + __bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0); + __bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0); // 3. Get all intersection points getIntersectionPoints( diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h index 22aa3e0ec..e9194ef52 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_utils.h +++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h @@ -455,8 +455,8 @@ __mlu_func__ void convexHullGraham( // if all of boxes are invalid, just return. int valid_box_count = __bang_count((T *)valid_box, real_compute_box_num); if (!valid_box_count) { - __bang_write_value((T *)ordered_pts_x, total_points, 0); - __bang_write_value((T *)ordered_pts_y, total_points, 0); + __bang_write_value((T *)ordered_pts_x, total_points, (T)0); + __bang_write_value((T *)ordered_pts_y, total_points, (T)0); __bang_write_value((T *)valid_pts, actual_compute_box_num, (T)1); __bang_write_value((T *)valid_pts + actual_compute_box_num, total_points - actual_compute_box_num, (T)0); @@ -559,8 +559,8 @@ __mlu_func__ void convexHullGraham( // assign invalid value to temp1_ram(-2 < -1) and temp2_ram for sorting. __bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)-2); __bang_write_value((T *)temp2_ram, actual_compute_box_num, (T)0); - __bang_write_value((T *)ordered_pts_x, total_points, 0); - __bang_write_value((T *)ordered_pts_y, total_points, 0); + __bang_write_value((T *)ordered_pts_x, total_points, (T)0); + __bang_write_value((T *)ordered_pts_y, total_points, (T)0); // get the offset of each max value according to the channel __mluop_get_stage_indices_tfuse((int *)temp3_ram, actual_compute_box_num); @@ -783,7 +783,7 @@ __mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box, actual_compute_box_num); // temp1 = area, initialize with all 0 - __bang_write_zero((T *)temp1_ram, actual_compute_box_num); + __bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)0); __bang_argmax((T *)temp6_ram, (T *)nums_in_ram, actual_compute_box_num); // temp_nums_in = max(nums_in) diff --git a/kernels/carafe/carafe_block.mlu b/kernels/carafe/carafe_block.mlu index c84572001..74eaed328 100644 --- a/kernels/carafe/carafe_block.mlu +++ b/kernels/carafe/carafe_block.mlu @@ -333,17 +333,17 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output, h_k * wo * group * k_up * k_up + w_k * group * k_up * k_up + group_k * k_up * k_up; - __bang_write_zero((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); + __bang_write_value((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); + __bang_write_value((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); __memcpy((T *)nram_buf + NRAM_BLOCK / sizeof(T), (T *)base_mask, k_up * k_up * sizeof(T), GDRAM2NRAM); for (int i = 0; i < num_per_loop; i++) { - __bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0); T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + h_k * wo * c + w_k * c + group_k * group_size + i * num_align; @@ -386,7 +386,7 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output, } } if (rem_for_loop) { - __bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0); T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + h_k * wo * c + w_k * c + group_k * group_size + num_per_loop * num_align; diff --git a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu index d9bb2bffa..e18e654b5 100644 --- a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu +++ b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu @@ -132,7 +132,7 @@ static __mlu_func__ void compute(int8_t *nram_vertices, int8_t *nram_mask, deal_num * dim_m); // preprocess to get pad index - __bang_write_zero(nram_temp0, dim_m); + __bang_write_value(nram_temp0, dim_m, (T)0); __bang_write_value(nram_temp0, INTERSECTION_OFFSET, (T)1.0); __bang_int82float(nram_pad, (int8_t *)(nram_mask_p), deal_num * dim_m, 0); __bang_cycle_maxequal(nram_pad, nram_pad, nram_temp0, deal_num * dim_m, diff --git a/kernels/lgamma/lgamma_block.mlu b/kernels/lgamma/lgamma_block.mlu index f3366189c..2383f96d4 100644 --- a/kernels/lgamma/lgamma_block.mlu +++ b/kernels/lgamma/lgamma_block.mlu @@ -161,7 +161,7 @@ __mlu_func__ void calcLgamma(float *buf0, float *buf1, float *buf2, float *buf3, * reflection_denom - lgamma_x : -reflection_denom; */ // using buf3 -> reflection - __bang_write_zero(buf4, num_deal); + __bang_write_value(buf4, num_deal, (float)0); __bang_sub(buf2, buf4, buf2, num_deal); isFinite(buf4, buf2, num_deal); __bang_sub(buf3, buf2, buf1, num_deal); diff --git a/kernels/logspace/logspace.cpp b/kernels/logspace/logspace.cpp index d898dd65f..421a7c5b4 100644 --- a/kernels/logspace/logspace.cpp +++ b/kernels/logspace/logspace.cpp @@ -31,7 +31,7 @@ static void LogspacePolicyFunc(const mluOpHandle_t &handle, const int64_t steps, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; uint32_t cluster_num = mluop::runtime::getCoreNumOfEachUnionCapability(handle); uint32_t core_in_cluster = handle->core_num_per_cluster; diff --git a/kernels/logspace/logspace_block.mlu b/kernels/logspace/logspace_block.mlu index 625597f24..5e24191cb 100644 --- a/kernels/logspace/logspace_block.mlu +++ b/kernels/logspace/logspace_block.mlu @@ -60,16 +60,16 @@ __mlu_func__ void float2DifferentType(float *result_float, T *result, __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, (int16_t *)table_half_all1, num, LUT_TABEL_LENGTH); __bang_float2half_tz((half *)result, result_float, num); - __bang_bor((int16_t *)result, (int16_t *)result, - (int16_t *)result_ge_half_max, num); + __bang_bor((int8_t *)result, (int8_t *)result, (int8_t *)result_ge_half_max, + 2 * num); __bang_ge_scalar((int16_t *)result_ge_half_max, (int16_t *)result_ge_half_max, 1, num); __nram__ int16_t table_half_inf[LUT_TABEL_LENGTH] = {(int16_t)0xffff, (int16_t)0xfc00}; __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, (int16_t *)table_half_inf, num, LUT_TABEL_LENGTH); - __bang_band((int16_t *)result, (int16_t *)result, - (int16_t *)result_ge_half_max, num); + __bang_band((int8_t *)result, (int8_t *)result, + (int8_t *)result_ge_half_max, 2 * num); } if (std::is_same::value) { __cn_vector_cast_f32_to_s32(num, (int *)result, result_float); @@ -236,8 +236,10 @@ __mlu_func__ void dealBaseNegative(const float start, const float end, __bang_float2int32((int *)floor_y, floor_y, actual_deal_num, 0); __bang_move(y_copy, log2_result, sizeof(float) * actual_deal_num); __bang_float2int32((int *)y_copy, y_copy, actual_deal_num, 0); - __bang_band((int *)y_copy, (int *)y_copy, all_int_1, actual_deal_num); - __bang_band((int *)y_copy, (int *)y_copy, (int *)floor_y, actual_deal_num); + __bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)all_int_1, + 4 * actual_deal_num); + __bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)floor_y, + 4 * actual_deal_num); __nram__ uint32_t table_for_odd_or_even_power[LUT_TABEL_LENGTH] = { 0, 0x80000000}; __bang_lut((int32_t *)y_copy, (uint32_t *)y_copy, @@ -247,12 +249,12 @@ __mlu_func__ void dealBaseNegative(const float start, const float end, __bang_lut((int32_t *)floor_y, (uint32_t *)floor_y, (int32_t *)table_for_integer_power, actual_deal_num, LUT_TABEL_LENGTH); - __bang_bor((int *)log2_result, (int *)log2_result, (int *)floor_y, - actual_deal_num); + __bang_bor((int8_t *)log2_result, (int8_t *)log2_result, (int8_t *)floor_y, + 4 * actual_deal_num); __bang_mul_scalar(log2_result, log2_result, base_log, actual_deal_num); __bang_pow2(result_float, log2_result, actual_deal_num); - __bang_bor((int *)result_float, (int *)result_float, (int *)y_copy, - actual_deal_num); + __bang_bor((int8_t *)result_float, (int8_t *)result_float, (int8_t *)y_copy, + 4 * actual_deal_num); float2DifferentType(result_float, result, actual_deal_num); __memcpy(res + loop_offset, result, actual_deal_num * sizeof(T), NRAM2GDRAM); diff --git a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu index e8e9ad0eb..734d680cc 100644 --- a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu +++ b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu @@ -183,7 +183,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate1( if ((samples < taskDim) && (taskId == 0)) { T *nram_grad_gates = (T *)nram_buffer; - __bang_write_zero(nram_grad_gates, samples); + __bang_write_value(nram_grad_gates, samples, (T)0); if (samples > 1) { int one_sample_task_num = taskDim / samples; @@ -285,7 +285,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate2( __bang_and(nram_mask, nram_mask, nram_indices, deal_s_num); T *nram_grad_gates = (T *)nram_indices; - __bang_write_zero(nram_grad_gates, deal_s_num); + __bang_write_value(nram_grad_gates, deal_s_num, 0); if (deal_s_num > 1) { T *base_dispatch_addr = (T *)dispatch; diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu index 9ff2a72e8..3795f41f0 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu @@ -370,7 +370,7 @@ void __mlu_func__ loadValue( b_col * spatial_size * qid_stride + level_start_id * qid_stride; } #endif - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask2, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -379,7 +379,7 @@ void __mlu_func__ loadValue( num_deal_grid * deal_num_real, 0); __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, num_deal_grid * deal_num_real, 64); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid, num_deal_grid); __sync_io_move_compute(); @@ -397,7 +397,7 @@ void __mlu_func__ loadValue( (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask4, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -410,7 +410,7 @@ void __mlu_func__ loadValue( (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask3, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -438,7 +438,7 @@ void __mlu_func__ computeGradValue( float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl, float *nram_grad_output_tr, float *nram_grad_output_bl, float *nram_grad_output_br, float *nram_grad_weight) { - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, @@ -600,7 +600,7 @@ void __mlu_func__ computeGradAttnWeight( const int32_t &num_per_time_real, const int32_t &num_heads, const int32_t &num_levels, const int32_t &num_points, const int32_t &grid_offset, float *nram_h_high_temp) { - __bang_write_zero(grad_w_weight, 2 * offset_nram); + __bang_write_value(grad_w_weight, 2 * offset_nram, (float)0); // grad_output_nram_tl __bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid, deal_num_real); @@ -714,7 +714,7 @@ void __mlu_func__ computeGradSampingLoc( num_points * deal_num_real, num_per_time_real * num_heads * num_levels); - __bang_write_zero(grad_temp1, num_deal_grid * deal_num_real); + __bang_write_value(grad_temp1, num_deal_grid * deal_num_real, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, num_deal_grid * deal_num_real, num_deal_grid); __bang_transpose(nram_grad_output_tr, grad_temp1, diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu index 66505f6c0..774b588d9 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu @@ -231,9 +231,11 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( const float w4 = lh * lw; if (likely(C_tail != 0)) { const int32_t base_ptr = m_col * channels + C_repeat * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); + __bang_write_value(grad_h_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_w_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_output_nram, PAD_UP(channels, ALIGN_NUM), 0); __memcpy(top_grad, grad_output + grad_output_offset + C_repeat * deal_num, @@ -250,9 +252,12 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( } for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) { const int32_t base_ptr = m_col * channels + C_loop * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); + __bang_write_value(grad_h_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_w_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_output_nram, PAD_UP(channels, ALIGN_NUM), + (float)0); __memcpy(top_grad, grad_output + grad_output_offset + C_loop * deal_num, deal_num * LEN_FLOAT, GDRAM2NRAM); diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu index e1abf634d..fa324ca94 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu @@ -418,7 +418,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( c_real_num = c_rem; } } - __bang_write_zero((float *)input_tl, 4 * deal_num * channel); + __bang_write_value((float *)input_tl, 4 * deal_num * channel, (float)0); __sync(); // load data_value for (int32_t p_idx = 0; p_idx < io_data_num; ++p_idx) { diff --git a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu index 11246d931..55a4ca8e7 100644 --- a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu +++ b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu @@ -116,11 +116,11 @@ __mlu_func__ void computePGrad(const int b, const int S, const int T, float *nram_cur_term2 = nram_cur_term1 + min_len + 1; float *nram_cur_p_grad = nram_cur_term2 + min_len; - __bang_write_zero(nram_cur_term1, 3 * min_len + 1); + __bang_write_value(nram_cur_term1, 3 * min_len + 1, (float)0); // compute the last one: p_grad[b][s_end][t_end] = ans_grad[b] - __memcpy(nram_p_grad + s_end * (T + 1) + t_end, ans_grad + b, - sizeof(float), GDRAM2NRAM); + __memcpy(nram_p_grad + s_end * (T + 1) + t_end, ans_grad + b, sizeof(float), + GDRAM2NRAM); nram_cur_p_grad[0] = nram_p_grad[s_end * (T + 1) + t_end]; int data_num = 0; @@ -242,7 +242,8 @@ __mlu_global__ void mluBlock3PipelineMutualInformationBackward( t_begin = boundary[1]; s_end = boundary[2]; t_end = boundary[3]; - __bang_write_zero((float *)nram_buffer, S * (T + 1) + (S + 1) * T); + __bang_write_value((float *)nram_buffer, S * (T + 1) + (S + 1) * T, + (float)0); if (s_begin > s_end || t_begin > t_end) { if (S > 0) { diff --git a/kernels/nms_rotated/nms_rotated_union1.mlu b/kernels/nms_rotated/nms_rotated_union1.mlu index d384cce4b..de5ab33c1 100644 --- a/kernels/nms_rotated/nms_rotated_union1.mlu +++ b/kernels/nms_rotated/nms_rotated_union1.mlu @@ -150,7 +150,8 @@ __mlu_func__ void nms_detection( void *vec2_y = (float *)vec2_x + 4 * max_seg_iou_pad; // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)score, copies_of_nram * max_seg_iou_pad); + __bang_write_value((uint8_t *)score, copies_of_nram * max_seg_iou_pad, + (uint8_t)0); for (int keep = 0; keep < input_box_num; keep++) { __sync_cluster(); @@ -270,7 +271,8 @@ __mlu_func__ void nms_detection( // Initialize valid_box, set actual_box_num boxes to 1, else set to 0 __bang_write_value(((float *)valid_box), seg_len, 1.0f); if (cpy_len < seg_len) { - __bang_write_zero((float *)valid_box + cpy_len, seg_len - cpy_len); + __bang_write_value((float *)valid_box + cpy_len, seg_len - cpy_len, + (float)0); } // Each box data: x, y, w, h, a @@ -323,8 +325,8 @@ __mlu_func__ void nms_detection( (float *)temp3_ram, (float *)temp4_ram, seg_len); // initialize valid_pts, nums_in - __bang_write_zero((float *)valid_pts, 24 * seg_len); - __bang_write_zero((float *)nums_in_ram, seg_len); + __bang_write_value((float *)valid_pts, 24 * seg_len, (float)0); + __bang_write_value((float *)nums_in_ram, seg_len, (float)0); // 3. Get all intersection points getIntersectionPoints( diff --git a/kernels/nms_rotated/nms_utils.h b/kernels/nms_rotated/nms_utils.h index 23a7b6434..c06cdc15c 100644 --- a/kernels/nms_rotated/nms_utils.h +++ b/kernels/nms_rotated/nms_utils.h @@ -840,7 +840,7 @@ __mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box, actual_compute_box_num); // temp1 = area, initialize with all 0 - __bang_write_zero((T *)temp1_ram, actual_compute_box_num); + __bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)0); __bang_argmax((T *)temp6_ram, (T *)nums_in_ram, actual_compute_box_num); // temp_nums_in = max(nums_in) diff --git a/kernels/prior_box/prior_box_block.mlu b/kernels/prior_box/prior_box_block.mlu index 6c3bc42bd..082965ad2 100644 --- a/kernels/prior_box/prior_box_block.mlu +++ b/kernels/prior_box/prior_box_block.mlu @@ -33,7 +33,7 @@ __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void generate_AbAb_Mask(T *a_mask, T a_index, T *b_mask, T b_index, T *tmp, T *result, const int align_num) { - __bang_write_zero(result, align_num); + __bang_write_value(result, align_num, (T)0); __bang_mul_scalar(tmp, a_mask, a_index, align_num); __bang_add(result, result, tmp, align_num); __bang_mul_scalar(tmp, b_mask, b_index, align_num); @@ -79,7 +79,7 @@ __mlu_global__ void mluKernelPriorVar(const int height, const int width, const int loop_pixel_num = pixel_end_index - pixel_begin_index; T *loop_gdram_ptr = var + pixel_begin_index * num_priors * 4; __gdramset(loop_gdram_ptr, loop_pixel_num * num_priors * 4, (T)0); - __bang_write_zero(var_nram, loop_pixel_num * one_var_size); + __bang_write_value(var_nram, loop_pixel_num * one_var_size, (T)0); __bang_cycle_add(var_nram, var_nram, variances_nram, loop_pixel_num * one_var_size, one_var_size); // memcpy to gdram diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index 5cc589956..2f33e9032 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -433,7 +433,7 @@ __mlu_global__ void roiAlignRotatedForward( if (cur_cache_c + c_cache_i > channels) { cur_cache_c = channels - c_cache_i; } - __bang_write_zero(output_channels, cur_cache_c); + __bang_write_value(output_channels, cur_cache_c, (T)0); for (uint32_t h_idx = 0; h_idx < roi_bin_grid_h; h_idx += bin_order_num) { uint32_t deal_bin_h = bin_order_num; diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu index ab06ec577..0035ce1f8 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu +++ b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu @@ -217,8 +217,8 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6); // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, - PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); + __bang_write_value((T *)auxiliary_a, + PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE), (T)0); if (repeat > 0) { __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), @@ -377,7 +377,8 @@ __mlu_global__ void MLUKernelRoipointPool3d( for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) { __memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T), boxes_num * 7 * sizeof(T), GDRAM2NRAM); - __bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE)); + __bang_write_value((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE), + (int32_t)0); const int8_t *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu index fe2631d95..7e68a20cb 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu +++ b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu @@ -213,8 +213,8 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6); // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, - PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); + __bang_write_value((T *)auxiliary_a, + PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE), (T)0); if (repeat > 0) { __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), diff --git a/kernels/three_interpolate/three_interpolate_union1.mlu b/kernels/three_interpolate/three_interpolate_union1.mlu index 1d0ba2e0e..1d0c8aa36 100644 --- a/kernels/three_interpolate/three_interpolate_union1.mlu +++ b/kernels/three_interpolate/three_interpolate_union1.mlu @@ -307,7 +307,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( // transpose the indices and weights for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { __bang_write_value(nram_indices_transpose + index * n_limit, n_limit, -1); - __bang_write_zero(nram_weights_transpose + index * n_limit, n_limit); + __bang_write_value(nram_weights_transpose + index * n_limit, n_limit, + (T)0); __memcpy(nram_indices_transpose + index * n_limit, nram_indices + index, sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), INDEX_WEIGHT_LAST_DIM * sizeof(int32_t), actual_n_size - 1); @@ -330,7 +331,7 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( MIN(CEIL_ALIGN(c_slice % c_limit, align_base_128), c_limit_new); } // 1.2 load Co*Mo features data - __bang_write_zero(nram_output, output_deal_size); + __bang_write_value(nram_output, output_deal_size, (T)0); uint32_t m_rem = m; for (uint32_t k = 0; k < m_repeated_times; ++k) { uint32_t m_slice = m_limit < m_rem ? m_limit : m_rem; @@ -355,8 +356,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( m_slice, GDRAM2NRAM, m_limit_new, m, c_slice - 1); } // 2. Compute - __bang_write_zero(nram_features_transpose, - features_deal_size + c_limit); + __bang_write_value(nram_features_transpose, + features_deal_size + c_limit, (T)0); c_limit = c_limit_new; m_limit = m_limit_new; // 2.1 transpose features from Co*Mo to Mo*Co to easily select one whole @@ -366,8 +367,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( uint32_t m_min = k * m_limit_org; uint32_t m_max = m_min + m_slice; for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { - __bang_write_zero(nram_features, output_deal_size); - __bang_write_zero(nram_features_selected, output_deal_size); + __bang_write_value(nram_features, output_deal_size, (T)0); + __bang_write_value(nram_features_selected, output_deal_size, (T)0); // 2.2 select the offset between the m_min and m_max // convert indices from int32_t to float if (m <= INT2FLOAT_KEEP_PRECISION_MAX_VALUE) { @@ -542,7 +543,7 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( MIN(CEIL_ALIGN(c_slice % c_limit, align_base_128), c_limit_new); } // initial the nram_grad_features with 0 - __bang_write_zero(nram_grad_features, grad_features_deal_size); + __bang_write_value(nram_grad_features, grad_features_deal_size, (T)0); uint32_t n_rem = n; for (uint32_t k = 0; k < n_repeated_times; ++k) { uint32_t n_slice = n_limit < n_rem ? n_limit : n_rem; @@ -579,7 +580,8 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { __bang_write_value(nram_indices_transpose + index * n_limit, n_limit, -1); - __bang_write_zero(nram_weights_transpose + index * n_limit, n_limit); + __bang_write_value(nram_weights_transpose + index * n_limit, n_limit, + (T)0); __memcpy(nram_indices_transpose + index * n_limit_new, nram_indices + index, sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), INDEX_WEIGHT_LAST_DIM * sizeof(int32_t), @@ -595,8 +597,8 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( // initial nram_grad_output_transpose with zero // and set extra c_limit size that will be selected by the index not in // [m_min, m_max) - __bang_write_zero(nram_grad_output_transpose, - grad_output_deal_size + c_limit); + __bang_write_value(nram_grad_output_transpose, + grad_output_deal_size + c_limit, (T)0); c_limit = c_limit_new; n_limit = n_limit_new; for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { diff --git a/kernels/tin_shift/tin_shift_union1.mlu b/kernels/tin_shift/tin_shift_union1.mlu index 95dc2216a..c34f79484 100644 --- a/kernels/tin_shift/tin_shift_union1.mlu +++ b/kernels/tin_shift/tin_shift_union1.mlu @@ -40,7 +40,7 @@ __mlu_func__ void mluMultiKernelTinShift( int t_shift = shifts[n_index * group_size + group_id]; int index = cur_channel_index % channel_size * hw_size + n_index * time_size * channel_size * hw_size; - __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (int8_t)0); __asm__ volatile("sync;"); if (abs(t_shift) >= time_size) { __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, @@ -123,7 +123,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( int next_sequence_index = index / hw_size / channel_size % time_size + segmentime_size; int cur_sequence_index = index / hw_size / channel_size % time_size; - __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (int8_t)0); __asm__ volatile("sync;"); if (max_number_hw_per_core == 0) { mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 9832ab4bf..82efbc1cd 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -502,7 +502,7 @@ __mlu_global__ void mluCalcPointsPerVoxel( // generate 0~deal_num indices. __mluop_get_stage_indices_tfuse(nram_base_offset, max_nram_count); - __bang_write_zero(nram_temp_zeros, max_nram_count); + __bang_write_value(nram_temp_zeros, max_nram_count, (int32_t)0); for (int32_t i = 0; i <= repeat; i++) { if (i == repeat && rem == 0) { break; diff --git a/kernels/yolo_box/yolo_box_block.mlu b/kernels/yolo_box/yolo_box_block.mlu index b258351ab..65a65fed3 100644 --- a/kernels/yolo_box/yolo_box_block.mlu +++ b/kernels/yolo_box/yolo_box_block.mlu @@ -217,7 +217,7 @@ __mlu_func__ void compute(T *nram_x, T *nram_y, T *nram_w, T *nram_h, if (clip_bbox == true) { // bx0 = bx0 > 0 ? bx0 : 0; // by0 = by0 > 0 ? by0 : 0; - __bang_write_zero(nram_conf_p, deal_num); + __bang_write_value(nram_conf_p, deal_num, (T)0); __bang_maxequal(nram_x_p, nram_conf_p, nram_x_p, deal_num); __bang_maxequal(nram_y_p, nram_conf_p, nram_y_p, deal_num); @@ -406,16 +406,16 @@ __mlu_func__ void YoloBoxComputeBbox( n_in, class_num, anchor_s, anchor_s, 0, c_in, hw_total_num, hw_seg_num, align_hw_seg_num, 0, 0); - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, n_in, anchor_s, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, n_in, anchor_s, anchor_s, 0, align_hw_seg_num); - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(img_size, nram_img_w, nram_img_h, n_in, anchor_s, align_hw_seg_num); __sync(); @@ -440,12 +440,12 @@ __mlu_func__ void YoloBoxComputeBbox( T *base_addr_x = (T *)x + hw_data_offset; T *base_addr_boxes = (T *)boxes + hw_data_offset; - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, deal_n_num, anchor_s, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, deal_n_num, anchor_s, anchor_s, 0, align_hw_seg_num); @@ -467,8 +467,8 @@ __mlu_func__ void YoloBoxComputeBbox( // C int *addr_img_size = (int *)img_size; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -494,8 +494,8 @@ __mlu_func__ void YoloBoxComputeBbox( // C int *addr_img_size = (int *)img_size + (n_iter + 1) * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -525,8 +525,8 @@ __mlu_func__ void YoloBoxComputeBbox( if (repeat_n > 0) { // C int *addr_img_size = (int *)img_size + (repeat_n - 1) * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -548,8 +548,8 @@ __mlu_func__ void YoloBoxComputeBbox( if (rem_n_num > 0) { // C int *addr_img_size = (int *)img_size + repeat_n * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, rem_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -575,8 +575,8 @@ __mlu_func__ void YoloBoxComputeBbox( T *base_addr_x = (T *)x + hw_data_offset; T *base_addr_boxes = (T *)boxes + hw_data_offset; - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, 1, deal_s_num, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); @@ -617,14 +617,14 @@ __mlu_func__ void YoloBoxComputeBbox( } // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, 0, align_hw_seg_num); int *addr_img_size = (int *)img_size; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, deal_s_num, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -671,8 +671,8 @@ __mlu_func__ void YoloBoxComputeBbox( } // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = (ns_iter + 1) * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, anchor_offset, align_hw_seg_num); @@ -684,8 +684,8 @@ __mlu_func__ void YoloBoxComputeBbox( anchor_num = anchor_s - s_num_offset; next_batch = deal_s_num > anchor_num; anchor_num = next_batch ? anchor_num : deal_s_num; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, anchor_num, align_hw_seg_num); @@ -728,8 +728,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (repeat_ns > 0) { // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = (repeat_ns - 1) * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, anchor_offset, align_hw_seg_num); @@ -741,8 +741,8 @@ __mlu_func__ void YoloBoxComputeBbox( int anchor_num = anchor_s - s_num_offset; bool next_batch = deal_s_num > anchor_num; anchor_num = next_batch ? anchor_num : deal_s_num; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, anchor_num, align_hw_seg_num); @@ -774,8 +774,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (rem_ns_num > 0) { // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = repeat_ns * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, rem_ns_num, anchor_offset, align_hw_seg_num); @@ -783,8 +783,8 @@ __mlu_func__ void YoloBoxComputeBbox( // init img w/h int batch_num = anchor_offset / anchor_s; int *addr_img_size = (int *)img_size + batch_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, rem_ns_num, align_hw_seg_num); @@ -817,13 +817,13 @@ __mlu_func__ void YoloBoxComputeBbox( T *addr_boxes_n = base_addr_boxes + n_iter * output_stride + s_iter * 4 * hw_total_num; - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, 1, s_iter, deal_num); - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); int *addr_img_size = (int *)img_size + n_iter * 2; initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, 1, deal_num); @@ -844,8 +844,8 @@ __mlu_func__ void YoloBoxComputeBbox( deal_num, deal_num, nram_pingpong_num, 1); // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, deal_num, deal_num, hw_offset); @@ -870,8 +870,8 @@ __mlu_func__ void YoloBoxComputeBbox( deal_num, deal_num, nram_pingpong_num, hw_iter + 2); // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + (hw_iter + 1) * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, deal_num, deal_num, hw_offset); @@ -899,8 +899,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (repeat_hw > 0) { // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + (repeat_hw - 1) * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, deal_num, deal_num, hw_offset); @@ -921,8 +921,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (rem_hw_num > 0) { // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + repeat_hw * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, rem_hw_num, deal_num, hw_offset); diff --git a/test/mlu_op_gtest/pb_gtest/include/runtime.h b/test/mlu_op_gtest/pb_gtest/include/runtime.h index 9bc9fdeaa..6d5bc22f7 100644 --- a/test/mlu_op_gtest/pb_gtest/include/runtime.h +++ b/test/mlu_op_gtest/pb_gtest/include/runtime.h @@ -38,7 +38,7 @@ #include "memory_pool.h" #ifndef CNRT_RET_ERR_INVALID -#define CNRT_RET_ERR_INVALID (632007) +#define CNRT_RET_ERR_INVALID ((cnrtRet_t)632007) #endif namespace mluoptest { diff --git a/test/mlu_op_gtest/pb_gtest/src/executor.cpp b/test/mlu_op_gtest/pb_gtest/src/executor.cpp index b8ac27704..626f6c987 100644 --- a/test/mlu_op_gtest/pb_gtest/src/executor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/executor.cpp @@ -380,7 +380,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { if (perfUseOriginData()) { void *src_data = getPerfSrcData(db); GTEST_CHECK(cnrtMemcpy(db->device_perf_ptr, src_data, db->size, - CNRT_MEM_TRANS_DIR_DEV2DEV) == + cnrtMemcpyDevToDev) == cnrtSuccess); oss << "copy data from " << src_data; } else { @@ -460,7 +460,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { if (skipMallocDevice(db.getMetaTensor())) continue; void *src_data = getPerfSrcData(&db); GTEST_CHECK(cnrtMemcpy(db.device_perf_ptr, src_data, db.size, - CNRT_MEM_TRANS_DIR_DEV2DEV) == cnrtSuccess); + cnrtMemcpyDevToDev) == cnrtSuccess); } } } @@ -2148,7 +2148,7 @@ void Executor::copyIn() { VLOG(4) << "copy from device_origin_ptr to device_perf_data_ptr"; GTEST_CHECK(cnrtSuccess == cnrtMemcpy(db->device_perf_data_ptr, db->device_origin_ptr, - db->size, CNRT_MEM_TRANS_DIR_DEV2DEV)); + db->size, cnrtMemcpyDevToDev)); } // for debug if (exe_config_->dump_data) { diff --git a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp index 0e5f808ec..14d3a026a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp @@ -65,10 +65,6 @@ void hardwareMonitor::start() { std::bind(&hardwareMonitor::monitorFrequencyOneGRepeat, this), monitor_hwtime); })); - monitor_threads.emplace_back(std::thread([&, this] { - monitorAllGRepeat(std::bind(&hardwareMonitor::monitorPowerOneGRepeat, this), - monitor_hwtime); - })); monitor_hwtime = true; monitor_threads.emplace_back(std::thread([&, this] { monitorAllGRepeat( @@ -166,62 +162,6 @@ void hardwareMonitor::monitorFrequencyOneGRepeat() { << "us per call."; } -void hardwareMonitor::monitorPowerOneGRepeat() { - std::ofstream power_file(results_dir + "/power_device_" + - std::to_string(global_var.dev_id_) + ".csv", - std::ios::app); - power_file << "relative_time(ns),instantaneous_power(W),average_power(W)\n"; - cndevDevice_t dev_id; - GTEST_CHECK(cnrtGetDevice(&dev_id) == cnrtSuccess); - GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); - int i = 1; - - cndevPowerInfo_t power_info_prev, power_info_curr; - size_t t_prev, t_curr; - auto getPower = [&, this]() { - power_info_curr.version = CNDEV_VERSION_5; - t_curr = MONITOR_CLOCK::now().time_since_epoch().count() - start_time_point; - // TODO(None): cntoolkit-3.6, use cndevGetDevicePower - // GTEST_CHECK(cndevGetDevicePower(&power_info_curr, dev_id) == - // CNDEV_SUCCESS); - GTEST_CHECK(cndevGetPowerInfo(&power_info_curr, dev_id) == CNDEV_SUCCESS); - }; - - MONITOR_CLOCK::time_point t1 = MONITOR_CLOCK::now(); - getPower(); - std::tie(t_prev, power_info_prev) = std::make_tuple(t_curr, power_info_curr); - power_file << t_prev << "," - << (uint32_t)(power_info_prev.instantaneousPowerUsage) << "," - << power_info_prev.usage << "\n"; - while (!status.finish_one_grepeat) { - ++i; - getPower(); - if (power_info_prev.instantaneousPowerUsage != - power_info_curr.instantaneousPowerUsage || - power_info_prev.usage != power_info_curr.usage) { - power_file << t_prev << "," - << (uint32_t)(power_info_prev.instantaneousPowerUsage) << "," - << power_info_prev.usage << "\n"; - power_file << t_curr << "," - << (uint32_t)(power_info_curr.instantaneousPowerUsage) << "," - << power_info_curr.usage << "\n"; - power_info_prev = power_info_curr; - } - t_prev = t_curr; - } - power_file << t_curr << "," - << (uint32_t)(power_info_curr.instantaneousPowerUsage) << "," - << power_info_curr.usage << "\n"; - MONITOR_CLOCK::time_point t2 = MONITOR_CLOCK::now(); - auto time_span = - std::chrono::duration_cast>(t2 - - t1); - // TODO(None): cntoolkit-3.6, remove this warning - LOG(WARNING) << "From cntoolkit-3.6 onward, use cndevGetDevicePower."; - VLOG(4) << "cndevGetDevicePower took " << time_span.count() / i - << "us per call."; -} - void hardwareMonitor::monitorHwtimeOneGRepeat() { { std::unique_lock lock(status.monitor_mutex); From acdae23187e7c912a88d72f76b7490e7510d9be2 Mon Sep 17 00:00:00 2001 From: nth-BYTE <160582271+nth-BYTE@users.noreply.github.com> Date: Tue, 31 Dec 2024 15:02:23 +0800 Subject: [PATCH 3/3] [Feature](mlu-ops): Access variable in tensor struct through function in mlu_op_gtest (#1152) Co-authored-by: nizhijie --- .../transpose_cpu/transpose_cpu.cpp | 10 +-- .../src/zoo/bbox_overlaps/bbox_overlaps.cpp | 4 +- .../border_align_backward.cpp | 30 ++++---- .../border_align_forward.cpp | 16 ++-- .../zoo/box_iou_rotated/box_iou_rotated.cpp | 14 ++-- .../dcn_backward_data/dcn_backward_data.cpp | 8 +- .../dcn_backward_weight.cpp | 58 +++++++-------- .../src/zoo/dcn_forward/dcn_forward.cpp | 56 +++++++------- .../deform_roi_pool_backward.cpp | 10 +-- .../deform_roi_pool_forward.cpp | 14 ++-- ...diff_iou_rotated_sort_vertices_forward.cpp | 6 +- .../mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp | 14 ++-- .../dynamic_point_to_voxel_backward.cpp | 4 +- .../dynamic_point_to_voxel_forward.cpp | 9 ++- .../mlu_op_gtest/pb_gtest/src/zoo/fft/fft.cpp | 8 +- .../generate_proposals_v2.cpp | 18 ++--- .../zoo/get_indice_pairs/get_indice_pairs.cpp | 12 +-- .../indice_convolution_backward_data.cpp | 65 ++++++++-------- .../indice_convolution_backward_filter.cpp | 53 ++++++------- .../indice_convolution_forward.cpp | 61 +++++++-------- .../pb_gtest/src/zoo/logspace/logspace.cpp | 2 +- .../masked_col2im_forward.cpp | 10 +-- .../masked_im2col_forward.cpp | 10 +-- .../moe_dispatch_backward_data.cpp | 12 +-- .../moe_dispatch_forward.cpp | 12 +-- .../ms_deform_attn_backward.cpp | 26 +++---- .../ms_deform_attn_forward.cpp | 36 ++++----- .../mutual_information_backward.cpp | 6 +- .../mutual_information_forward.cpp | 6 +- .../mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp | 74 +++++++++---------- .../src/zoo/nms_rotated/nms_rotated.cpp | 6 +- .../zoo/points_in_boxes/points_in_boxes.cpp | 18 +++-- .../pb_gtest/src/zoo/poly_nms/poly_nms.cpp | 4 +- .../pb_gtest/src/zoo/prior_box/prior_box.cpp | 12 +-- .../zoo/psamask_backward/psamask_backward.cpp | 12 +-- .../zoo/psamask_forward/psamask_forward.cpp | 12 +-- .../psroipool_backward/psroipool_backward.cpp | 12 +-- .../psroipool_forward/psroipool_forward.cpp | 12 +-- .../roi_align_backward/roi_align_backward.cpp | 36 ++++----- .../roi_align_rotated_backward.cpp | 18 ++--- .../roi_align_rotated_forward.cpp | 18 ++--- .../roi_crop_backward/roi_crop_backward.cpp | 14 ++-- .../zoo/roi_crop_forward/roi_crop_forward.cpp | 14 ++-- .../roi_pooling_backward.cpp | 46 ++++++------ .../roi_pooling_forward.cpp | 2 +- .../zoo/roialign_forward/roialign_forward.cpp | 56 +++++++------- .../roiaware_pool3d_backward.cpp | 2 +- .../zoo/roipoint_pool3d/roipoint_pool3d.cpp | 38 +++++----- .../rotated_feature_align_backward.cpp | 10 +-- .../rotated_feature_align_forward.cpp | 10 +-- .../sync_batch_norm_backward_elemt.cpp | 6 +- .../sync_batchnorm_backward_elemt_v2.cpp | 8 +- .../sync_batchnorm_backward_reduce.cpp | 3 +- .../sync_batchnorm_elemt.cpp | 3 +- ...ync_batchnorm_gather_stats_with_counts.cpp | 28 +++---- .../sync_batchnorm_stats.cpp | 8 +- .../three_interpolate_backward.cpp | 8 +- .../three_interpolate_forward.cpp | 8 +- .../tin_shift_backward/tin_shift_backward.cpp | 12 +-- .../tin_shift_forward/tin_shift_forward.cpp | 12 +-- .../src/zoo/voxelization/voxelization.cpp | 10 +-- .../pb_gtest/src/zoo/yolo_box/yolo_box.cpp | 6 +- 62 files changed, 573 insertions(+), 555 deletions(-) diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp index 3a807147d..8065cea76 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp @@ -89,7 +89,7 @@ mluOpStatus_t mluOpTransposeCpu(const int64_t dim_desc, PARAM_CHECK("[cnnlTransposeCpu]", y_desc != NULL); uint64_t sum = mluOpGetTensorElementNum(x_desc); // zero elements, return success - if (sum == 0 || x_desc->dim == 0 || y_desc->dim == 0) { + if (sum == 0 || x_desc->getDim() == 0 || y_desc->getDim() == 0) { VLOG(5) << "cnnlTransposeCpu:: zero elements, return success."; return MLUOP_STATUS_SUCCESS; } @@ -97,7 +97,7 @@ mluOpStatus_t mluOpTransposeCpu(const int64_t dim_desc, PARAM_CHECK("[cnnlTransposeCpu]", y != NULL); const uint64_t dim_all = dim_desc; - auto data_type = x_desc->dtype; + auto data_type = x_desc->getDtype(); int loop_d = 1; if (data_type == MLUOP_DTYPE_INT31) { loop_d = 2; @@ -112,17 +112,17 @@ mluOpStatus_t mluOpTransposeCpu(const int64_t dim_desc, uint64_t DIM[TRANSPOSE_MAX_DIM + 1] = {1, 1, 1, 1, 1, 1, 1, 1, 1}; uint64_t dim[TRANSPOSE_MAX_DIM + 1] = {0}; - if (x_desc->dim != dim_all || y_desc->dim != dim_all) { + if (x_desc->getDim() != dim_all || y_desc->getDim() != dim_all) { LOG(ERROR) << "cnnlTransposeCpu: dimension information mismatch, dim of x: " - << x_desc->dim << ", dim of y: " << y_desc->dim + << x_desc->getDim() << ", dim of y: " << y_desc->getDim() << ", dim of descriptor: " << dim_all; return MLUOP_STATUS_BAD_PARAM; } for (int i = 0; i < dim_all; i++) { permute[i] = permute_desc[i]; - DIM[i] = x_desc->dims[i]; + DIM[i] = x_desc->getDimIndex(i); } if (MLUOP_DTYPE_INT31 == data_type) { transposeCpuNd(loop_d, (int16_t *)x, (int16_t *)y, sum, dim, DIM, permute); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/bbox_overlaps/bbox_overlaps.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/bbox_overlaps/bbox_overlaps.cpp index 696e702c8..8998017fd 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/bbox_overlaps/bbox_overlaps.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/bbox_overlaps/bbox_overlaps.cpp @@ -152,8 +152,8 @@ void BboxOverlapsExecutor::cpuCompute() { auto bbox1_desc = tensor_desc_[0].tensor; auto bbox2_desc = tensor_desc_[1].tensor; - int rows = bbox1_desc->dims[0]; - int cols = bbox2_desc->dims[0]; + int rows = bbox1_desc->getDimIndex(0); + int cols = bbox2_desc->getDimIndex(0); // get struct param int mode = parser_->getProtoNode()->bbox_overlaps_param().mode(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_backward/border_align_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_backward/border_align_backward.cpp index b68137b24..c76b03475 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_backward/border_align_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_backward/border_align_backward.cpp @@ -113,19 +113,19 @@ void BorderAlignBackwardExecutor::cpuCompute() { float *boxes = cpu_fp32_input_[1]; float *argmax_idx = cpu_fp32_input_[2]; float *grad_input = cpu_fp32_output_[0]; - const int32_t box_size = boxes_desc->dims[1]; - const int32_t channels = grad_output_desc->dims[3]; - const int32_t height = grad_input_desc->dims[1]; - const int32_t width = grad_input_desc->dims[2]; - const int32_t N = grad_output_desc->dims[0]; - const int32_t H = grad_output_desc->dims[1]; - const int32_t W = grad_output_desc->dims[2]; - const int32_t C = grad_output_desc->dims[3]; + const int32_t box_size = boxes_desc->getDimIndex(1); + const int32_t channels = grad_output_desc->getDimIndex(3); + const int32_t height = grad_input_desc->getDimIndex(1); + const int32_t width = grad_input_desc->getDimIndex(2); + const int32_t N = grad_output_desc->getDimIndex(0); + const int32_t H = grad_output_desc->getDimIndex(1); + const int32_t W = grad_output_desc->getDimIndex(2); + const int32_t C = grad_output_desc->getDimIndex(3); - const int32_t N1 = grad_input_desc->dims[0]; - const int32_t H1 = grad_input_desc->dims[1]; - const int32_t W1 = grad_input_desc->dims[2]; - const int32_t C1 = grad_input_desc->dims[3]; + const int32_t N1 = grad_input_desc->getDimIndex(0); + const int32_t H1 = grad_input_desc->getDimIndex(1); + const int32_t W1 = grad_input_desc->getDimIndex(2); + const int32_t C1 = grad_input_desc->getDimIndex(3); float x_stride = 0; float y_stride = 0; float stride = 0; @@ -260,9 +260,9 @@ void BorderAlignBackwardExecutor::cpuCompute() { int64_t BorderAlignBackwardExecutor::getTheoryOps() { auto input_desc = parser_->getMetaTensor(0).tensor; auto boxes_desc = parser_->getMetaTensor(1).tensor; - const int32_t N = input_desc->dims[0]; - const int32_t C = input_desc->dims[3] / 4; - const int32_t K = boxes_desc->dims[1]; + const int32_t N = input_desc->getDimIndex(0); + const int32_t C = input_desc->getDimIndex(3) / 4; + const int32_t K = boxes_desc->getDimIndex(1); const int64_t theory_ops = N * K * 4 * C * 3; return theory_ops; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_forward/border_align_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_forward/border_align_forward.cpp index 04560c592..652f87a88 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_forward/border_align_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/border_align_forward/border_align_forward.cpp @@ -113,11 +113,11 @@ float bilinear_interpolate(const float *input, const int32_t H, const int32_t W, void BorderAlignForwardExecutor::cpuCompute() { auto input_desc = parser_->getMetaTensor(0).tensor; auto boxes_desc = parser_->getMetaTensor(1).tensor; - const int32_t N = input_desc->dims[0]; - const int32_t H = input_desc->dims[1]; - const int32_t W = input_desc->dims[2]; - const int32_t C = input_desc->dims[3] / 4; - const int32_t K = boxes_desc->dims[1]; + const int32_t N = input_desc->getDimIndex(0); + const int32_t H = input_desc->getDimIndex(1); + const int32_t W = input_desc->getDimIndex(2); + const int32_t C = input_desc->getDimIndex(3) / 4; + const int32_t K = boxes_desc->getDimIndex(1); float x1, x2, y1, y2; float x_stride = 0; float y_stride = 0; @@ -195,9 +195,9 @@ void BorderAlignForwardExecutor::cpuCompute() { int64_t BorderAlignForwardExecutor::getTheoryOps() { auto input_desc = parser_->getMetaTensor(0).tensor; auto boxes_desc = parser_->getMetaTensor(1).tensor; - const int32_t N = input_desc->dims[0]; - const int32_t C = input_desc->dims[3] / 4; - const int32_t K = boxes_desc->dims[1]; + const int32_t N = input_desc->getDimIndex(0); + const int32_t C = input_desc->getDimIndex(3) / 4; + const int32_t K = boxes_desc->getDimIndex(1); const int64_t theory_ops = N * K * 4 * C * 14; return theory_ops; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp index f413b8184..c69585364 100755 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp @@ -62,8 +62,8 @@ void BoxIouRotatedExecutor::cpuCompute() { auto box1_desc = tensor_desc_[0].tensor; auto box2_desc = tensor_desc_[1].tensor; - auto num_box1 = box1_desc->dims[0]; - auto num_box2 = box2_desc->dims[0]; + auto num_box1 = box1_desc->getDimIndex(0); + auto num_box2 = box2_desc->getDimIndex(0); int mode = parser_->getProtoNode()->box_iou_rotated_param().mode(); bool aligned = parser_->getProtoNode()->box_iou_rotated_param().aligned(); @@ -85,17 +85,17 @@ void BoxIouRotatedExecutor::cpuBoxIouRotated(const T *box1_raw, VLOG(4) << "num box1: " << num_box1; VLOG(4) << "num box2: " << num_box2; if (aligned) { - int num_ious = tensor_desc_[2].tensor->dims[0]; + int num_ious = tensor_desc_[2].tensor->getDimIndex(0); VLOG(4) << "num_ious: " << num_ious; GTEST_CHECK(num_box1 == num_ious, "when aligned, num_box1 should equal to num_ious."); } else { - int num_ious = tensor_desc_[2].tensor->dims[0]; + int num_ious = tensor_desc_[2].tensor->getDimIndex(0); VLOG(4) << "num_ious[0]: " << num_ious; - num_ious = tensor_desc_[2].tensor->dims[1]; + num_ious = tensor_desc_[2].tensor->getDimIndex(1); VLOG(4) << "num_ious[1]: " << num_ious; - GTEST_CHECK(((num_box1 == tensor_desc_[2].tensor->dims[0]) || - (num_box2 == tensor_desc_[2].tensor->dims[1])), + GTEST_CHECK(((num_box1 == tensor_desc_[2].tensor->getDimIndex(0)) || + (num_box2 == tensor_desc_[2].tensor->getDimIndex(1))), "when not aligned, num_ious should equal to num_box1*num_box2"); } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_data/dcn_backward_data.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_data/dcn_backward_data.cpp index 4c3b52779..c601e8c55 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_data/dcn_backward_data.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_data/dcn_backward_data.cpp @@ -101,8 +101,8 @@ void DcnBackwardDataExecutor::workspaceMalloc() { grad_offset_desc_ = parser_->outputs()[1].tensor; grad_mask_desc_ = use_grad_mask_ ? parser_->outputs()[2].tensor : nullptr; - grad_output_desc_->onchip_dtype = grad_output_oc_dt_; - weight_desc_->onchip_dtype = weight_oc_dt_; + grad_output_desc_->setOnchipDtype(grad_output_oc_dt_); + weight_desc_->setOnchipDtype(weight_oc_dt_); dcn_desc_ = cpu_runtime_.allocate(mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); @@ -153,8 +153,8 @@ void DcnBackwardDataExecutor::compute() { void *dev_grad_mask = use_mask_ ? data_vector_[6 + use_mask_].device_ptr : nullptr; - grad_output_desc_->onchip_dtype = grad_output_oc_dt_; - weight_desc_->onchip_dtype = weight_oc_dt_; + grad_output_desc_->setOnchipDtype(grad_output_oc_dt_); + weight_desc_->setOnchipDtype(weight_oc_dt_); VLOG(4) << "call mluOpDCNBackwardData()"; interface_timer_.start(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp index f8a89520e..647568dc0 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp @@ -163,8 +163,8 @@ void DcnBackwardWeightExecutor::workspaceMalloc() { parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; } - input_desc->onchip_dtype = input_onchip_dtype; - grad_output_desc->onchip_dtype = grad_output_onchip_dtype; + input_desc->setOnchipDtype(input_onchip_dtype); + grad_output_desc->setOnchipDtype(grad_output_onchip_dtype); MLUOP_CHECK(mluOpGetDCNBackwardWeightWorkspaceSize( handle_, dcn_desc, input_desc, offset_desc, mask_desc, grad_output_desc, grad_weight_desc, grad_bias_desc, &workspace_size)); @@ -191,7 +191,7 @@ void DcnBackwardWeightExecutor::compute() { if (dcn_param.has_compute_type()) { compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); } else { - compute_type = input_desc->dtype; + compute_type = input_desc->getDtype(); } mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( @@ -226,8 +226,8 @@ void DcnBackwardWeightExecutor::compute() { parser_->getOutputNum() == 1 ? nullptr : data_vector_[5].device_ptr; } - input_desc->onchip_dtype = input_onchip_dtype; - grad_output_desc->onchip_dtype = grad_output_onchip_dtype; + input_desc->setOnchipDtype(input_onchip_dtype); + grad_output_desc->setOnchipDtype(grad_output_onchip_dtype); VLOG(4) << "call mluOpDCNBackwardWeight()"; interface_timer_.start(); @@ -444,15 +444,15 @@ void DcnBackwardWeightExecutor::computeDCNBackwardWeightCPU( const mluOpTensorDescriptor_t grad_bias_desc, void *cpu_grad_bias, float *buffer, int pad[], int stride[], int dilation[], int64_t &theory_ops) { - const int N = input_desc->dims[0]; - const int hi = input_desc->dims[1]; - const int wi = input_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int co = grad_output_desc->dims[3]; - const int kh = grad_weight_desc->dims[1]; - const int kw = grad_weight_desc->dims[2]; + const int N = input_desc->getDimIndex(0); + const int hi = input_desc->getDimIndex(1); + const int wi = input_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int co = grad_output_desc->getDimIndex(3); + const int kh = grad_weight_desc->getDimIndex(1); + const int kw = grad_weight_desc->getDimIndex(2); const int pt = pad[0]; const int pb = pad[1]; const int pl = pad[2]; @@ -579,12 +579,12 @@ void DcnBackwardWeightExecutor::cpuCompute() { parser_->getOutputNum() == 1 ? nullptr : cpu_fp32_output_[1]; } - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int kh = grad_weight_desc->dims[1]; - const int kw = grad_weight_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int co = grad_output_desc->dims[3]; + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int kh = grad_weight_desc->getDimIndex(1); + const int kw = grad_weight_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int co = grad_output_desc->getDimIndex(3); size_t cpu_buffer_size = 0; if (g == 1) { @@ -634,15 +634,15 @@ int64_t DcnBackwardWeightExecutor::getTheoryOps() { grad_bias_desc = parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; } - const int N = input_desc->dims[0]; - const int hi = input_desc->dims[1]; - const int wi = input_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int co = grad_output_desc->dims[3]; - const int kh = grad_weight_desc->dims[1]; - const int kw = grad_weight_desc->dims[2]; + const int N = input_desc->getDimIndex(0); + const int hi = input_desc->getDimIndex(1); + const int wi = input_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int co = grad_output_desc->getDimIndex(3); + const int kh = grad_weight_desc->getDimIndex(1); + const int kw = grad_weight_desc->getDimIndex(2); int coeff = getCoefficientOfLT2CT(); const int k = im2col_step * ho * wo; const int m = co / g; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp index 39d56f13e..47803de4a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp @@ -158,8 +158,8 @@ void DcnForwardExecutor::workspaceMalloc() { output_desc = tensor_desc_[5].tensor; } - input_desc->onchip_dtype = input_onchip_dtype; - weight_desc->onchip_dtype = weight_onchip_dtype; + input_desc->setOnchipDtype(input_onchip_dtype); + weight_desc->setOnchipDtype(weight_onchip_dtype); MLUOP_CHECK(mluOpGetDCNForwardWorkspaceSize( handle_, dcn_desc, input_desc, offset_desc, mask_desc, weight_desc, @@ -236,8 +236,8 @@ void DcnForwardExecutor::compute() { output = data_vector_[5].device_ptr; } - input_desc->onchip_dtype = input_onchip_dtype; - weight_desc->onchip_dtype = weight_onchip_dtype; + input_desc->setOnchipDtype(input_onchip_dtype); + weight_desc->setOnchipDtype(weight_onchip_dtype); VLOG(4) << "call mluOpDCNForward()"; interface_timer_.start(); @@ -447,15 +447,15 @@ void DcnForwardExecutor::computeDCNForwardCPU( const mluOpTensorDescriptor_t output_desc, const void *cpu_output, float *buffer, int pad[], int stride[], int dilation[], int64_t &theory_ops) { - const int N = input_desc->dims[0]; - const int hi = input_desc->dims[1]; - const int wi = input_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int co = output_desc->dims[3]; - const int kh = weight_desc->dims[1]; - const int kw = weight_desc->dims[2]; + const int N = input_desc->getDimIndex(0); + const int hi = input_desc->getDimIndex(1); + const int wi = input_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int co = output_desc->getDimIndex(3); + const int kh = weight_desc->getDimIndex(1); + const int kw = weight_desc->getDimIndex(2); const int pt = pad[0]; const int pb = pad[1]; const int pl = pad[2]; @@ -594,12 +594,12 @@ void DcnForwardExecutor::cpuCompute() { cpu_output = cpu_fp32_output_[0]; } - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int kh = weight_desc->dims[1]; - const int kw = weight_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int co = output_desc->dims[3]; + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int kh = weight_desc->getDimIndex(1); + const int kw = weight_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int co = output_desc->getDimIndex(3); size_t cpu_buffer_size = 0; if (g == 1) { @@ -652,15 +652,15 @@ int64_t DcnForwardExecutor::getTheoryOps() { output_desc = tensor_desc_[5].tensor; } - const int N = input_desc->dims[0]; - const int hi = input_desc->dims[1]; - const int wi = input_desc->dims[2]; - const int ci = input_desc->dims[3]; - const int ho = offset_desc->dims[1]; - const int wo = offset_desc->dims[2]; - const int co = output_desc->dims[3]; - const int kh = weight_desc->dims[1]; - const int kw = weight_desc->dims[2]; + const int N = input_desc->getDimIndex(0); + const int hi = input_desc->getDimIndex(1); + const int wi = input_desc->getDimIndex(2); + const int ci = input_desc->getDimIndex(3); + const int ho = offset_desc->getDimIndex(1); + const int wo = offset_desc->getDimIndex(2); + const int co = output_desc->getDimIndex(3); + const int kh = weight_desc->getDimIndex(1); + const int kw = weight_desc->getDimIndex(2); int coeff = getCoefficientOfLT2CT(); const int k = kh * kw * ci / g; const int m = im2col_step * ho * wo; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_backward/deform_roi_pool_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_backward/deform_roi_pool_backward.cpp index b69975e44..47af02fb2 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_backward/deform_roi_pool_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_backward/deform_roi_pool_backward.cpp @@ -58,11 +58,11 @@ void DeformRoiPoolBackwardExecutor::initData() { grad_input_desc = tensor_desc_[3].tensor; } - batchs = input_desc->dims[0]; - height = input_desc->dims[1]; - width = input_desc->dims[2]; - channels = input_desc->dims[3]; - rois_num = rois_desc->dims[0]; + batchs = input_desc->getDimIndex(0); + height = input_desc->getDimIndex(1); + width = input_desc->getDimIndex(2); + channels = input_desc->getDimIndex(3); + rois_num = rois_desc->getDimIndex(0); // get params auto deform_roi_pool_backward_proto_desc = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_forward/deform_roi_pool_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_forward/deform_roi_pool_forward.cpp index 8b16e5161..cd4b7f99c 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_forward/deform_roi_pool_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/deform_roi_pool_forward/deform_roi_pool_forward.cpp @@ -52,13 +52,13 @@ void DeformRoiPoolForwardExecutor::initData() { output_desc = tensor_desc_[2].tensor; } - batchs = input_desc->dims[0]; - height = input_desc->dims[1]; - width = input_desc->dims[2]; - channels = input_desc->dims[3]; - rois_num = rois_desc->dims[0]; - pooled_height = output_desc->dims[1]; - pooled_width = output_desc->dims[2]; + batchs = input_desc->getDimIndex(0); + height = input_desc->getDimIndex(1); + width = input_desc->getDimIndex(2); + channels = input_desc->getDimIndex(3); + rois_num = rois_desc->getDimIndex(0); + pooled_height = output_desc->getDimIndex(1); + pooled_width = output_desc->getDimIndex(2); // get params auto deform_roi_pool_forward_proto_desc = parser_->getProtoNode()->deform_roi_pool_forward_param(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp index 41bfbd9e0..1e8fe8a9f 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp @@ -98,9 +98,9 @@ void DiffIouRotatedSortVerticesForwardExecutor::cpuCompute() { float *data_idx = (float *)cpu_fp32_output_[0]; auto vertices_desc = tensor_desc_[0].tensor; - int dim_b = vertices_desc->dims[0]; - int dim_n = vertices_desc->dims[1]; - int dim_m = vertices_desc->dims[2]; + int dim_b = vertices_desc->getDimIndex(0); + int dim_n = vertices_desc->getDimIndex(1); + int dim_m = vertices_desc->getDimIndex(2); memset(data_idx, 0, dim_b*dim_n*9 * sizeof(int)); for (int bi = 0; bi < dim_b; ++bi) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp index e54ea793a..d2ff90b3a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp @@ -63,12 +63,14 @@ void DivExecutor::cpuCompute() { auto c_desc = tensor_desc_[2].tensor; float *a_broadcast = (float *)cpu_runtime_.allocate(count3 * sizeof(float)); float *b_broadcast = (float *)cpu_runtime_.allocate(count3 * sizeof(float)); - expand_compute_cpu(std::vector(a_desc->dims, a_desc->dims + a_desc->dim), - std::vector(c_desc->dims, c_desc->dims + c_desc->dim), - cpu_fp32_input_[0], a_broadcast); - expand_compute_cpu(std::vector(b_desc->dims, b_desc->dims + b_desc->dim), - std::vector(c_desc->dims, c_desc->dims + c_desc->dim), - cpu_fp32_input_[1], b_broadcast); + expand_compute_cpu( + std::vector(a_desc->getDims(), a_desc->getDims() + a_desc->getDim()), + std::vector(c_desc->getDims(), c_desc->getDims() + c_desc->getDim()), + cpu_fp32_input_[0], a_broadcast); + expand_compute_cpu( + std::vector(b_desc->getDims(), b_desc->getDims() + b_desc->getDim()), + std::vector(c_desc->getDims(), c_desc->getDims() + c_desc->getDim()), + cpu_fp32_input_[1], b_broadcast); for (size_t i = 0; i < count3; ++i) { cpu_fp32_output_[0][i] = a_broadcast[i] / b_broadcast[i]; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp index efc852b34..cc8d1fb04 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp @@ -119,8 +119,8 @@ void DynamicPointToVoxelBackwardExecutor::cpuCompute() { auto feats_desc = tensor_desc_[1].tensor; int M = voxel_num[0]; - int C = feats_desc->dims[1]; - int N = feats_desc->dims[0]; + int C = feats_desc->getDimIndex(1); + int N = feats_desc->getDimIndex(0); VLOG(5) << "M=" << M; VLOG(5) << "C=" << C; VLOG(5) << "N=" << N; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp index fa65a5b27..baabe1311 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp @@ -117,9 +117,9 @@ void DynamicPointToVoxelForwardExecutor::cpuCompute() { auto coors = cpu_fp32_input_[1]; auto feats_desc = tensor_desc_[0].tensor; auto coors_desc = tensor_desc_[1].tensor; - const int32_t N = coors_desc->dims[0]; - const int32_t num_coors = coors_desc->dims[1]; - const int32_t num_features = feats_desc->dims[1]; + const int32_t N = coors_desc->getDimIndex(0); + const int32_t num_coors = coors_desc->getDimIndex(1); + const int32_t num_features = feats_desc->getDimIndex(1); // Get output auto voxel_feats = cpu_fp32_output_[0]; @@ -224,7 +224,8 @@ void DynamicPointToVoxelForwardExecutor::cpuCompute() { // 5. Calculate voxel_feats const float fill_value = reduce_mode == REDUCE_MODE_MAX ? -1.17549e038 : 0x0; - for (int32_t i = 0; i < voxel_feats_desc->dims[0] * num_features; ++i) { + for (int32_t i = 0; i < voxel_feats_desc->getDimIndex(0) * num_features; + ++i) { voxel_feats[i] = fill_value; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/fft/fft.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/fft/fft.cpp index a00301e18..0deca3389 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/fft/fft.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/fft/fft.cpp @@ -92,8 +92,8 @@ int64_t FftExecutor::getTheoryOps() { auto fft_param = parser_->getProtoNode()->fft_param(); int rank = fft_param.rank(); int bc = 1; - if (input_tensor->dim != rank) { - bc = input_tensor->dims[0]; + if (input_tensor->getDim() != rank) { + bc = input_tensor->getDimIndex(0); } int n = fft_param.n(0); @@ -125,8 +125,8 @@ int64_t FftExecutor::getTheoryIoSize() { auto fft_param = parser_->getProtoNode()->fft_param(); int rank = fft_param.rank(); int bc = 1; - if (input_tensor->dim != rank) { - bc = input_tensor->dims[0]; + if (input_tensor->getDim() != rank) { + bc = input_tensor->getDimIndex(0); } int n = fft_param.n(0); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp index da8a3d91a..d75cd7605 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp @@ -156,10 +156,10 @@ void GenerateProposalsV2Executor::cpuCompute() { auto tensor_scores = parser_->getMetaTensor("input1").tensor; - const int N = tensor_scores->dims[0]; - const int H = tensor_scores->dims[1]; - const int W = tensor_scores->dims[2]; - const int A = tensor_scores->dims[3]; + const int N = tensor_scores->getDimIndex(0); + const int H = tensor_scores->getDimIndex(1); + const int W = tensor_scores->getDimIndex(2); + const int A = tensor_scores->getDimIndex(3); auto scores_ptr = parser_->getMetaTensor("input1").cpu_ptr; auto deltas_ptr = parser_->getMetaTensor("input2").cpu_ptr; @@ -183,12 +183,12 @@ void GenerateProposalsV2Executor::cpuCompute() { int64_t GenerateProposalsV2Executor::getTheoryOps() { VLOG(4) << "getTheoryOps"; - // int dims = parser_->getMetaTensor("input1").tensor->dims[0]; + // int dims = parser_->getMetaTensor("input1").tensor->getDimIndex(0); auto tensor_scores = parser_->getMetaTensor("input1").tensor; - const int N = tensor_scores->dims[0]; - const int H = tensor_scores->dims[1]; - const int W = tensor_scores->dims[2]; - const int A = tensor_scores->dims[3]; + const int N = tensor_scores->getDimIndex(0); + const int H = tensor_scores->getDimIndex(1); + const int W = tensor_scores->getDimIndex(2); + const int A = tensor_scores->getDimIndex(3); int64_t theory_ops = 39 * N * A * H * W; VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; return theory_ops; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/get_indice_pairs/get_indice_pairs.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/get_indice_pairs/get_indice_pairs.cpp index f86bd82a3..51e2bcafd 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/get_indice_pairs/get_indice_pairs.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/get_indice_pairs/get_indice_pairs.cpp @@ -139,7 +139,7 @@ void GetIndicePairsExecutor::castIn() { void GetIndicePairsExecutor::diffPreprocess() { float *cpu_input = (float *)cpu_fp32_output_[1]; - int32_t input_active_in = indice_pairs_desc_->dims[2]; + int32_t input_active_in = indice_pairs_desc_->getDimIndex(2); int32_t kernel_volume = 1; for (int i = 0; i < filter_space_.size(); i++) { kernel_volume *= filter_space_[i]; @@ -294,7 +294,7 @@ void GetIndicePairsExecutor::cpuGetIndicePairs( std::vector out_spatail_shape, const int32_t dimNb, const int32_t sub_m, const int32_t batch_size) { int32_t num_act = 0; - int32_t num_act_in = indice_in_desc->dims[0]; + int32_t num_act_in = indice_in_desc->getDimIndex(0); int32_t batch_idx = 0; int32_t spatail_volume = 1; int32_t NDim = dimNb - 2; @@ -406,8 +406,8 @@ void GetIndicePairsExecutor::cpuGetIndicePairs( } int64_t GetIndicePairsExecutor::getTheoryOps() { - int64_t kernel_volume = indice_pairs_desc_->dims[0]; - int64_t active_input_in = indice_pairs_desc_->dims[2]; + int64_t kernel_volume = indice_pairs_desc_->getDimIndex(0); + int64_t active_input_in = indice_pairs_desc_->getDimIndex(2); int64_t dims = dimNb_ - 2 + 1; int64_t total_op_size = 0; int64_t kernel1_op_size = 0, kernel2_op_size = 0, kernel3_op_size = 0, @@ -450,8 +450,8 @@ int64_t GetIndicePairsExecutor::getTheoryOps() { } int64_t GetIndicePairsExecutor::getTheoryIoSize() { - int64_t kernel_volume = indice_pairs_desc_->dims[0]; - int64_t active_input_in = indice_pairs_desc_->dims[2]; + int64_t kernel_volume = indice_pairs_desc_->getDimIndex(0); + int64_t active_input_in = indice_pairs_desc_->getDimIndex(2); int64_t dims = dimNb_ - 2 + 1; int64_t total_io_size = 0; int64_t kernel1_io_size = 0, kernel2_io_size = 0, kernel3_io_size = 0, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp index 2e3aa4930..1fbf88242 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp @@ -30,44 +30,44 @@ namespace mluoptest { void IndiceConvolutionBackwardDataExecutor::getFilterDims() { const mluOpTensorDescriptor_t filters_desc = tensor_desc_[1].tensor; - const mluOpTensorLayout_t layout = filters_desc->layout; + const mluOpTensorLayout_t layout = filters_desc->getLayout(); kd = 1; filter_4d = true; if (layout == MLUOP_LAYOUT_NCHW) { - dyc = filters_desc->dims[0]; - dxc = filters_desc->dims[1]; - kh = filters_desc->dims[2]; - kw = filters_desc->dims[3]; + dyc = filters_desc->getDimIndex(0); + dxc = filters_desc->getDimIndex(1); + kh = filters_desc->getDimIndex(2); + kw = filters_desc->getDimIndex(3); } else if (layout == MLUOP_LAYOUT_NHWC) { - dyc = filters_desc->dims[0]; - dxc = filters_desc->dims[3]; - kh = filters_desc->dims[1]; - kw = filters_desc->dims[2]; + dyc = filters_desc->getDimIndex(0); + dxc = filters_desc->getDimIndex(3); + kh = filters_desc->getDimIndex(1); + kw = filters_desc->getDimIndex(2); } else if (layout == MLUOP_LAYOUT_HWCN) { - dyc = filters_desc->dims[3]; - dxc = filters_desc->dims[2]; - kh = filters_desc->dims[0]; - kw = filters_desc->dims[1]; + dyc = filters_desc->getDimIndex(3); + dxc = filters_desc->getDimIndex(2); + kh = filters_desc->getDimIndex(0); + kw = filters_desc->getDimIndex(1); } else if (layout == MLUOP_LAYOUT_NDHWC) { - dyc = filters_desc->dims[0]; - dxc = filters_desc->dims[4]; - kd = filters_desc->dims[1]; - kh = filters_desc->dims[2]; - kw = filters_desc->dims[3]; + dyc = filters_desc->getDimIndex(0); + dxc = filters_desc->getDimIndex(4); + kd = filters_desc->getDimIndex(1); + kh = filters_desc->getDimIndex(2); + kw = filters_desc->getDimIndex(3); filter_4d = false; } else if (layout == MLUOP_LAYOUT_NCDHW) { - dyc = filters_desc->dims[0]; - dxc = filters_desc->dims[1]; - kd = filters_desc->dims[2]; - kh = filters_desc->dims[3]; - kw = filters_desc->dims[4]; + dyc = filters_desc->getDimIndex(0); + dxc = filters_desc->getDimIndex(1); + kd = filters_desc->getDimIndex(2); + kh = filters_desc->getDimIndex(3); + kw = filters_desc->getDimIndex(4); filter_4d = false; } else if (layout == MLUOP_LAYOUT_ARRAY) { - dyc = filters_desc->dims[4]; - dxc = filters_desc->dims[3]; - kd = filters_desc->dims[0]; - kh = filters_desc->dims[1]; - kw = filters_desc->dims[2]; + dyc = filters_desc->getDimIndex(4); + dxc = filters_desc->getDimIndex(3); + kd = filters_desc->getDimIndex(0); + kh = filters_desc->getDimIndex(1); + kw = filters_desc->getDimIndex(2); filter_4d = false; } } @@ -262,7 +262,7 @@ void IndiceConvolutionBackwardDataExecutor::cpuCompute() { int K = kd * kh * kw; int filter_num = K * dyc * dxc; const mluOpTensorDescriptor_t filters_desc = tensor_desc_[1].tensor; - const mluOpTensorLayout_t layout = filters_desc->layout; + const mluOpTensorLayout_t layout = filters_desc->getLayout(); float *filter_transpose_cpu; if (!(layout == MLUOP_LAYOUT_HWCN)) { filter_transpose_cpu = @@ -282,17 +282,18 @@ void IndiceConvolutionBackwardDataExecutor::cpuCompute() { // get index pair param const mluOpTensorDescriptor_t indice_pairs_desc = tensor_desc_[2].tensor; - int L = indice_pairs_desc->dims[2]; + int L = indice_pairs_desc->getDimIndex(2); // main calculation // set input data to 0 int input_grad_data_count = parser_->getOutputDataCount(0); memset(cpu_fp32_output_[0], 0x00, - mluOpDataTypeBytes(indice_pairs_desc->dtype) * input_grad_data_count); + mluOpDataTypeBytes(indice_pairs_desc->getDtype()) * + input_grad_data_count); float *output_grad = cpu_fp32_input_[0]; float *indice_pairs = cpu_fp32_input_[2]; float *input_grad = cpu_fp32_output_[0]; - bool is_float = (filters_desc->dtype == MLUOP_DTYPE_FLOAT); + bool is_float = (filters_desc->getDtype() == MLUOP_DTYPE_FLOAT); for (int i = 0; i < input_grad_data_count; ++i) { input_grad[i] = 0; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp index e38c7e602..970d5b1cc 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp @@ -42,7 +42,7 @@ void IndiceConvolutionBackwardFilterExecutor::initParam() { subm_ = op_param.sub_m(); diffw_trans_ = false; - // if (MLUOP_LAYOUT_HWCN != diffw_desc_->layout) { + // if (MLUOP_LAYOUT_HWCN != diffw_desc_->getLayout()) { // diffw_trans_ = true; // } } @@ -131,14 +131,14 @@ void IndiceConvolutionBackwardFilterExecutor::cpuCompute() { } if (diffw_trans_) { - temp_diffw = (float *)cpu_runtime_.allocate(diffw_desc_->total_element_num * - sizeof(float)); + temp_diffw = (float *)cpu_runtime_.allocate( + diffw_desc_->getTotalElementNum() * sizeof(float)); } - int64_t in_active_num = input_indice_desc_->dims[0]; - int64_t ci = input_indice_desc_->dims[1]; - int64_t co = diffy_indice_desc_->dims[1]; - int64_t kd = diffw_desc_->dim == 4 ? 1 : mluOpGetTensordimD(diffw_desc_); + int64_t in_active_num = input_indice_desc_->getDimIndex(0); + int64_t ci = input_indice_desc_->getDimIndex(1); + int64_t co = diffy_indice_desc_->getDimIndex(1); + int64_t kd = diffw_desc_->getDim() == 4 ? 1 : mluOpGetTensordimD(diffw_desc_); int64_t kh = mluOpGetTensordimH(diffw_desc_); int64_t kw = mluOpGetTensordimH(diffw_desc_); int64_t kernel_volume = kd * kh * kw; @@ -169,7 +169,8 @@ void IndiceConvolutionBackwardFilterExecutor::cpuCompute() { } // trans if (diffw_trans_) { - cpuTranspose(diffw, temp_diffw, kernel_volume, ci, co, diffw_desc_->layout); + cpuTranspose(diffw, temp_diffw, kernel_volume, ci, co, + diffw_desc_->getLayout()); cpu_runtime_.deallocate(temp_diffw); } @@ -177,13 +178,13 @@ void IndiceConvolutionBackwardFilterExecutor::cpuCompute() { } int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryOps() { - int64_t ci = input_indice_desc_->dims[1]; - int64_t co = diffy_indice_desc_->dims[1]; - int64_t kernel_volume = indice_pair_desc_->dims[0]; + int64_t ci = input_indice_desc_->getDimIndex(1); + int64_t co = diffy_indice_desc_->getDimIndex(1); + int64_t kernel_volume = indice_pair_desc_->getDimIndex(0); int64_t total_ops = 0; // fill theory ops - total_ops += diffw_desc_->total_tensor_size; + total_ops += diffw_desc_->getTotalTensorSize(); for (int64_t i = 0; i < kernel_volume; ++i) { if (indice_num_[0] <= 0) { continue; @@ -194,27 +195,27 @@ int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryOps() { } // transpose theory ops if (diffw_trans_) { - total_ops += diffw_desc_->total_element_num; + total_ops += diffw_desc_->getTotalElementNum(); } return total_ops; } int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryIoSize() { int32_t *indice_pair = (int32_t *)(data_vector_[2].host_ptr); - int64_t ci = input_indice_desc_->dims[1]; - int64_t co = diffy_indice_desc_->dims[1]; - int64_t in_active_num = input_indice_desc_->dims[0]; - int64_t kernel_volume = indice_pair_desc_->dims[0]; + int64_t ci = input_indice_desc_->getDimIndex(1); + int64_t co = diffy_indice_desc_->getDimIndex(1); + int64_t in_active_num = input_indice_desc_->getDimIndex(0); + int64_t kernel_volume = indice_pair_desc_->getDimIndex(0); int64_t theory_ios = 0; size_t input_indice_dwidth, diffy_indice_dwidth, indice_pair_dwidth, diffw_dwidth; - MLUOP_CHECK( - mluOpGetSizeOfDataType(input_indice_desc_->dtype, &input_indice_dwidth)); - MLUOP_CHECK( - mluOpGetSizeOfDataType(diffy_indice_desc_->dtype, &diffy_indice_dwidth)); - MLUOP_CHECK( - mluOpGetSizeOfDataType(indice_pair_desc_->dtype, &indice_pair_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(diffw_desc_->dtype, &diffw_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(input_indice_desc_->getDtype(), + &input_indice_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(diffy_indice_desc_->getDtype(), + &diffy_indice_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(indice_pair_desc_->getDtype(), + &indice_pair_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(diffw_desc_->getDtype(), &diffw_dwidth)); auto gather_nd_ios = [&](const int64_t kernel_index, const int64_t gather_num, const int64_t channel, @@ -237,7 +238,7 @@ int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryIoSize() { }; // fill theory ios - theory_ios += diffw_desc_->total_tensor_size; + theory_ios += diffw_desc_->getTotalTensorSize(); for (int64_t i = 0; i < kernel_volume; ++i) { if (indice_num_[i] <= 0) { @@ -254,7 +255,7 @@ int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryIoSize() { } // transpose theory ios if (diffw_trans_) { - theory_ios += diffw_desc_->total_tensor_size * 2; + theory_ios += diffw_desc_->getTotalTensorSize() * 2; } return theory_ios; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp index 61a80277e..036b0d238 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp @@ -95,10 +95,11 @@ void IndiceConvolutionForwardExecutor::cpuCompute() { return; // skip zero element } - int64_t num_active_in = features_desc_->dims[0]; - int64_t ci = features_desc_->dims[1]; - int64_t co = features_out_desc_->dims[1]; - bool filters_need_transpose = filters_desc_->layout != MLUOP_LAYOUT_ARRAY; + int64_t num_active_in = features_desc_->getDimIndex(0); + int64_t ci = features_desc_->getDimIndex(1); + int64_t co = features_out_desc_->getDimIndex(1); + bool filters_need_transpose = + filters_desc_->getLayout() != MLUOP_LAYOUT_ARRAY; int64_t kd = 0; int64_t kh = 0; int64_t kw = 0; @@ -107,9 +108,9 @@ void IndiceConvolutionForwardExecutor::cpuCompute() { kh = mluOpGetTensordimH(filters_desc_); kw = mluOpGetTensordimW(filters_desc_); } else { - kd = filters_desc_->dims[0]; - kh = filters_desc_->dims[1]; - kw = filters_desc_->dims[2]; + kd = filters_desc_->getDimIndex(0); + kh = filters_desc_->getDimIndex(1); + kw = filters_desc_->getDimIndex(2); } int64_t num_filters = kd * kh * kw; @@ -117,12 +118,12 @@ void IndiceConvolutionForwardExecutor::cpuCompute() { float *filters_transed = filters; if (filters_need_transpose) { filters_transed = (float *)cpu_runtime_.allocate( - filters_desc_->total_element_num * sizeof(float)); - if (filters_desc_->layout == MLUOP_LAYOUT_NCDHW) { + filters_desc_->getTotalElementNum() * sizeof(float)); + if (filters_desc_->getLayout() == MLUOP_LAYOUT_NCDHW) { stride[0] = 1; stride[1] = num_filters; stride[2] = num_filters * ci; - } else if (filters_desc_->layout == MLUOP_LAYOUT_NDHWC) { + } else if (filters_desc_->getLayout() == MLUOP_LAYOUT_NDHWC) { stride[0] = ci; stride[1] = 1; stride[2] = ci * num_filters; @@ -140,9 +141,9 @@ void IndiceConvolutionForwardExecutor::cpuCompute() { } int32_t features_out_data_count = parser_->getOutputDataCount(0); - memset( - cpu_fp32_output_[0], 0x00, - mluOpDataTypeBytes(features_out_desc_->dtype) * features_out_data_count); + memset(cpu_fp32_output_[0], 0x00, + mluOpDataTypeBytes(features_out_desc_->getDtype()) * + features_out_data_count); for (int64_t kdi = 0; kdi < kd; ++kdi) { for (int64_t khi = 0; khi < kh; ++khi) { @@ -175,13 +176,13 @@ void IndiceConvolutionForwardExecutor::cpuCompute() { } int64_t IndiceConvolutionForwardExecutor::getTheoryOps() { - int64_t ci = features_desc_->dims[1]; - int64_t co = features_out_desc_->dims[1]; - int64_t num_filters = indice_pairs_desc_->dims[0]; + int64_t ci = features_desc_->getDimIndex(1); + int64_t co = features_out_desc_->getDimIndex(1); + int64_t num_filters = indice_pairs_desc_->getDimIndex(0); int64_t total_ops = 0; // initialize output to 0 - total_ops += features_out_desc_->total_element_num; + total_ops += features_out_desc_->getTotalElementNum(); for (int64_t i = 0; i < num_filters; ++i) { if (indice_num_[i] < 0) { continue; @@ -196,27 +197,29 @@ int64_t IndiceConvolutionForwardExecutor::getTheoryOps() { // transpose filters ops bool filters_need_transpose = true; if (filters_need_transpose) { - total_ops += filters_desc_->total_element_num; + total_ops += filters_desc_->getTotalElementNum(); } return total_ops; } int64_t IndiceConvolutionForwardExecutor::getTheoryIoSize() { int32_t *indice_pair = (int32_t *)(data_vector_[2].host_ptr); - int64_t ci = features_desc_->dims[1]; - int64_t co = features_out_desc_->dims[2]; - int64_t num_active_in = features_desc_->dims[0]; - int64_t num_active_out = features_out_desc_->dims[0]; - int64_t num_filters = indice_pairs_desc_->dims[0]; + int64_t ci = features_desc_->getDimIndex(1); + int64_t co = features_out_desc_->getDimIndex(2); + int64_t num_active_in = features_desc_->getDimIndex(0); + int64_t num_active_out = features_out_desc_->getDimIndex(0); + int64_t num_filters = indice_pairs_desc_->getDimIndex(0); int64_t theory_ios = 0; size_t features_dwidth, filters_dwidth, indice_pairs_dwith, features_out_dwith; - MLUOP_CHECK(mluOpGetSizeOfDataType(features_desc_->dtype, &features_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(filters_desc_->dtype, &filters_dwidth)); MLUOP_CHECK( - mluOpGetSizeOfDataType(indice_pairs_desc_->dtype, &indice_pairs_dwith)); + mluOpGetSizeOfDataType(features_desc_->getDtype(), &features_dwidth)); MLUOP_CHECK( - mluOpGetSizeOfDataType(features_out_desc_->dtype, &features_out_dwith)); + mluOpGetSizeOfDataType(filters_desc_->getDtype(), &filters_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(indice_pairs_desc_->getDtype(), + &indice_pairs_dwith)); + MLUOP_CHECK(mluOpGetSizeOfDataType(features_out_desc_->getDtype(), + &features_out_dwith)); auto gather_scatter_ios = [&](const int64_t index, const int64_t num, const int64_t channel, @@ -234,10 +237,10 @@ int64_t IndiceConvolutionForwardExecutor::getTheoryIoSize() { }; // fill ios - theory_ios += filters_desc_->total_tensor_size; + theory_ios += filters_desc_->getTotalTensorSize(); // transpose ios - theory_ios += filters_desc_->total_element_num * 2; + theory_ios += filters_desc_->getTotalElementNum() * 2; for (int64_t i = 0; i < num_filters; ++i) { if (indice_num_[i] <= 0) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp index cbf8b12a0..6661129e7 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp @@ -57,7 +57,7 @@ void LogspaceExecutor::cpuCompute() { auto count = parser_->output(0)->shape_count; float step = (end_num_ - start_num_) / (steps_num_ - 1); - switch (tensor_desc_[1].tensor->dtype) { + switch (tensor_desc_[1].tensor->getDtype()) { case MLUOP_DTYPE_FLOAT: { for (int i = 0; i < count; ++i) { cpu_fp32_output_[0][i] = ::powf(base_num_, start_num_ + step * i); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/masked_col2im_forward/masked_col2im_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/masked_col2im_forward/masked_col2im_forward.cpp index e1cdb18e1..d5d08a726 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/masked_col2im_forward/masked_col2im_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/masked_col2im_forward/masked_col2im_forward.cpp @@ -43,11 +43,11 @@ void MaskedCol2imForwardExecutor::paramCheck() { void MaskedCol2imForwardExecutor::init() { auto col_desc = tensor_desc_[0].tensor; auto im_desc = tensor_desc_[3].tensor; - batchs_ = im_desc->dims[0]; - channels_ = im_desc->dims[1]; - height_ = im_desc->dims[2]; - width_ = im_desc->dims[3]; - mask_cnt_ = col_desc->dims[1]; + batchs_ = im_desc->getDimIndex(0); + channels_ = im_desc->getDimIndex(1); + height_ = im_desc->getDimIndex(2); + width_ = im_desc->getDimIndex(3); + mask_cnt_ = col_desc->getDimIndex(1); } void MaskedCol2imForwardExecutor::workspaceMalloc() { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/masked_im2col_forward/masked_im2col_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/masked_im2col_forward/masked_im2col_forward.cpp index 7f4ce97b3..84271721f 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/masked_im2col_forward/masked_im2col_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/masked_im2col_forward/masked_im2col_forward.cpp @@ -47,11 +47,11 @@ void MaskedIm2colForwardExecutor::paramCheck() { void MaskedIm2colForwardExecutor::init() { auto input_desc = tensor_desc_[0].tensor; auto mask_desc = tensor_desc_[1].tensor; - batchs_ = input_desc->dims[0]; - channels_ = input_desc->dims[1]; - height_ = input_desc->dims[2]; - width_ = input_desc->dims[3]; - mask_cnt_ = mask_desc->dims[0]; + batchs_ = input_desc->getDimIndex(0); + channels_ = input_desc->getDimIndex(1); + height_ = input_desc->getDimIndex(2); + width_ = input_desc->getDimIndex(3); + mask_cnt_ = mask_desc->getDimIndex(0); auto masked_im2col_forward_proto_desc = parser_->getProtoNode()->masked_im2col_forward_param(); kernel_h = masked_im2col_forward_proto_desc.kernel_h(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp index d605d83b4..fcd0804a2 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp @@ -140,13 +140,15 @@ int64_t MoeDispatchBackwardDataExecutor::getTheoryOps() { int64_t MoeDispatchBackwardDataExecutor::getTheoryIoSize() { size_t gates_dwidth, indices_dwidth, locations_dwidth, dispatch_dwidth, grad_input_dwidth; - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->dtype, &gates_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_indices_->dtype, &indices_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->getDtype(), &gates_dwidth)); MLUOP_CHECK( - mluOpGetSizeOfDataType(desc_locations_->dtype, &locations_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_dispatch_->dtype, &dispatch_dwidth)); + mluOpGetSizeOfDataType(desc_indices_->getDtype(), &indices_dwidth)); MLUOP_CHECK( - mluOpGetSizeOfDataType(desc_grad_input_->dtype, &grad_input_dwidth)); + mluOpGetSizeOfDataType(desc_locations_->getDtype(), &locations_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_dispatch_->getDtype(), &dispatch_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_grad_input_->getDtype(), &grad_input_dwidth)); int64_t gates_theory_ios = samples_mask_num_ * gates_dwidth; int64_t indices_theory_ios = samples_mask_num_ * indices_dwidth; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp index cea466c02..f27e703e5 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp @@ -115,12 +115,14 @@ int64_t MoeDispatchForwardExecutor::getTheoryOps() { int64_t MoeDispatchForwardExecutor::getTheoryIoSize() { size_t gates_dwidth, indices_dwidth, locations_dwidth, input_dwidth, dispatch_dwidth; - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->dtype, &gates_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_indices_->dtype, &indices_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->getDtype(), &gates_dwidth)); MLUOP_CHECK( - mluOpGetSizeOfDataType(desc_locations_->dtype, &locations_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_input_->dtype, &input_dwidth)); - MLUOP_CHECK(mluOpGetSizeOfDataType(desc_input_->dtype, &dispatch_dwidth)); + mluOpGetSizeOfDataType(desc_indices_->getDtype(), &indices_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_locations_->getDtype(), &locations_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_input_->getDtype(), &input_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_input_->getDtype(), &dispatch_dwidth)); int64_t gates_theory_ios = samples_ * gates_dwidth; int64_t indices_theory_ios = samples_ * indices_dwidth; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_backward/ms_deform_attn_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_backward/ms_deform_attn_backward.cpp index b42aaa096..693591ab4 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_backward/ms_deform_attn_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_backward/ms_deform_attn_backward.cpp @@ -152,15 +152,15 @@ void MsDeformAttnBackwardExecutor::cpuCompute() { mluOpTensorDescriptor_t value_desc = tensor_desc_[0].tensor; mluOpTensorDescriptor_t sampling_loc_desc = tensor_desc_[3].tensor; - const int32_t batch = value_desc->dims[0]; - const int32_t channels = value_desc->dims[3]; + const int32_t batch = value_desc->getDimIndex(0); + const int32_t channels = value_desc->getDimIndex(3); - const int32_t num_query = sampling_loc_desc->dims[1]; - const int32_t num_heads = sampling_loc_desc->dims[2]; - const int32_t num_levels = sampling_loc_desc->dims[3]; - const int32_t num_point = sampling_loc_desc->dims[4]; + const int32_t num_query = sampling_loc_desc->getDimIndex(1); + const int32_t num_heads = sampling_loc_desc->getDimIndex(2); + const int32_t num_levels = sampling_loc_desc->getDimIndex(3); + const int32_t num_point = sampling_loc_desc->getDimIndex(4); const int32_t qid_stride = num_heads * channels; - const int32_t spatial_size = value_desc->dims[1]; + const int32_t spatial_size = value_desc->getDimIndex(1); const int32_t grad_weight_stride = 1; const int32_t grad_loc_stride = 2; @@ -226,12 +226,12 @@ int64_t MsDeformAttnBackwardExecutor::getTheoryOps() { auto grad_value_desc = tensor_desc_[6].tensor; auto grad_sampling_loc_desc = tensor_desc_[7].tensor; - const int32_t batch = grad_value_desc->dims[0]; - const int32_t channels = grad_value_desc->dims[3]; - const int32_t num_query = grad_sampling_loc_desc->dims[1]; - const int32_t num_heads = grad_sampling_loc_desc->dims[2]; - const int32_t num_levels = grad_sampling_loc_desc->dims[3]; - const int32_t num_point = grad_sampling_loc_desc->dims[4]; + const int32_t batch = grad_value_desc->getDimIndex(0); + const int32_t channels = grad_value_desc->getDimIndex(3); + const int32_t num_query = grad_sampling_loc_desc->getDimIndex(1); + const int32_t num_heads = grad_sampling_loc_desc->getDimIndex(2); + const int32_t num_levels = grad_sampling_loc_desc->getDimIndex(3); + const int32_t num_point = grad_sampling_loc_desc->getDimIndex(4); const int64_t count = 48; const int64_t theory_ops = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_forward/ms_deform_attn_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_forward/ms_deform_attn_forward.cpp index 351717e18..cb1bfe02a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_forward/ms_deform_attn_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/ms_deform_attn_forward/ms_deform_attn_forward.cpp @@ -184,13 +184,13 @@ void MsDeformAttnForwardExecutor::cpuCompute() { auto tensor_data_sampling_loc = tensor_desc_[3].tensor; auto tensor_data_attn_weight = tensor_desc_[4].tensor; auto tensor_data_col = tensor_desc_[5].tensor; - int batch_size = tensor_data_value->dims[0]; - int num_keys = tensor_data_value->dims[1]; - int num_heads = tensor_data_value->dims[2]; - int channels = tensor_data_value->dims[3]; - int num_levels = tensor_data_spatial_shapes->dims[0]; - int num_query = tensor_data_sampling_loc->dims[1]; - int num_point = tensor_data_sampling_loc->dims[4]; + int batch_size = tensor_data_value->getDimIndex(0); + int num_keys = tensor_data_value->getDimIndex(1); + int num_heads = tensor_data_value->getDimIndex(2); + int channels = tensor_data_value->getDimIndex(3); + int num_levels = tensor_data_spatial_shapes->getDimIndex(0); + int num_query = tensor_data_sampling_loc->getDimIndex(1); + int num_point = tensor_data_sampling_loc->getDimIndex(4); auto data_value = cpu_fp32_input_[0]; auto data_spatial_shapes = cpu_fp32_input_[1]; auto data_level_start_index = cpu_fp32_input_[2]; @@ -214,12 +214,12 @@ int64_t MsDeformAttnForwardExecutor::getTheoryIoSize() { auto tensor_data_value = tensor_desc_[0].tensor; auto tensor_data_spatial_shapes = tensor_desc_[1].tensor; auto tensor_data_sampling_loc = tensor_desc_[3].tensor; - size_t batch_size = tensor_data_value->dims[0]; - size_t num_heads = tensor_data_value->dims[2]; - size_t channels = tensor_data_value->dims[3]; - size_t num_levels = tensor_data_spatial_shapes->dims[0]; - size_t num_query = tensor_data_sampling_loc->dims[1]; - size_t num_point = tensor_data_sampling_loc->dims[4]; + size_t batch_size = tensor_data_value->getDimIndex(0); + size_t num_heads = tensor_data_value->getDimIndex(2); + size_t channels = tensor_data_value->getDimIndex(3); + size_t num_levels = tensor_data_spatial_shapes->getDimIndex(0); + size_t num_query = tensor_data_sampling_loc->getDimIndex(1); + size_t num_point = tensor_data_sampling_loc->getDimIndex(4); size_t total_size = 0; total_size += 4 * batch_size * num_query * num_heads * num_levels * num_point * channels * @@ -240,11 +240,11 @@ int64_t MsDeformAttnForwardExecutor::getTheoryOps() { auto tensor_data_value = tensor_desc_[0].tensor; auto tensor_data_spatial_shapes = tensor_desc_[1].tensor; auto tensor_data_sampling_loc = tensor_desc_[3].tensor; - size_t batch_size = tensor_data_value->dims[0]; - size_t num_heads = tensor_data_value->dims[2]; - size_t num_levels = tensor_data_spatial_shapes->dims[0]; - size_t num_query = tensor_data_sampling_loc->dims[1]; - size_t num_point = tensor_data_sampling_loc->dims[4]; + size_t batch_size = tensor_data_value->getDimIndex(0); + size_t num_heads = tensor_data_value->getDimIndex(2); + size_t num_levels = tensor_data_spatial_shapes->getDimIndex(0); + size_t num_query = tensor_data_sampling_loc->getDimIndex(1); + size_t num_point = tensor_data_sampling_loc->getDimIndex(4); int64_t count = 11; int64_t theory_ops = batch_size * num_query * num_heads * num_levels * num_point * count; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_backward/mutual_information_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_backward/mutual_information_backward.cpp index 7555d9010..751bbb231 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_backward/mutual_information_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_backward/mutual_information_backward.cpp @@ -49,9 +49,9 @@ void MutualInformationBackwardExecutor::initParam() { host_ans_grad_in = (float *)data_vector_[3].host_ptr; } - B_ = px_desc_->dims[0]; - S_ = px_desc_->dims[1]; - T_ = py_desc_->dims[2]; + B_ = px_desc_->getDimIndex(0); + S_ = px_desc_->getDimIndex(1); + T_ = py_desc_->getDimIndex(2); px_index_ = Index3D(S_, T_ + 1); py_index_ = Index3D(S_ + 1, T_); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_forward/mutual_information_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_forward/mutual_information_forward.cpp index cb8bd68f6..785932959 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_forward/mutual_information_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/mutual_information_forward/mutual_information_forward.cpp @@ -41,9 +41,9 @@ void MutualInformationForwardExecutor::initParam() { host_p_in = (float *)data_vector_[2].host_ptr; } - B_ = px_desc_->dims[0]; - S_ = px_desc_->dims[1]; - T_ = py_desc_->dims[2]; + B_ = px_desc_->getDimIndex(0); + S_ = px_desc_->getDimIndex(1); + T_ = py_desc_->getDimIndex(2); px_index_ = MutualInformationForward::Index3D(S_, T_ + 1); py_index_ = MutualInformationForward::Index3D(S_ + 1, T_); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp index 0eb97444e..7e38df607 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp @@ -41,11 +41,11 @@ void NmsExecutor::workspaceMalloc() { auto tensor_boxes = parser_->getMetaTensor("input1").tensor; auto tensor_confi = parser_->getMetaTensor("input2").tensor; auto input_layout = parser_->getProtoNode()->nms_param().input_layout(); - if (tensor_boxes->dim == 2) { + if (tensor_boxes->getDim() == 2) { if (input_layout == 0) { - box_dim_ = tensor_boxes->dims[1]; + box_dim_ = tensor_boxes->getDimIndex(1); } else { - box_dim_ = tensor_boxes->dims[0]; + box_dim_ = tensor_boxes->getDimIndex(0); } } VLOG(4) << "box_dim_: " << box_dim_; @@ -70,7 +70,7 @@ void NmsExecutor::workspaceMalloc() { // this op will modify input data. // when repeat != 1, after second compute(), input data has been modified. // input data changed, the result of this op ("result_num", - // aka output->dims[0]) is changed. + // aka output->getDimIndex(0)) is changed. // so we don't know result_num when repeat finished. // so ignore what result_num(valid data number in output0) is, // set all data in output0 as 0. @@ -105,8 +105,7 @@ void NmsExecutor::compute() { (mluOpNmsAlgo_t)parser_->getProtoNode()->nms_param().algo(); float offset = parser_->getProtoNode()->nms_param().offset(); auto input_layout = parser_->getProtoNode()->nms_param().input_layout(); - int max_output_size = - parser_->getProtoNode()->nms_param().max_output_boxes(); + int max_output_size = parser_->getProtoNode()->nms_param().max_output_boxes(); float confidence_threshold = parser_->getProtoNode()->nms_param().confidence_threshold(); bool pad_to_max_output_size = @@ -131,10 +130,9 @@ void NmsExecutor::compute() { VLOG(4) << "call mluop NmsTensor()"; mluOpNmsDescriptor_t nms_desc; - nms_desc = - cpu_runtime_.allocate(mluOpCreateNmsDescriptor, - mluOpDestroyNmsDescriptor); - VLOG(5) << "tensor_boxes->dim: " << tensor_boxes->dim; + nms_desc = cpu_runtime_.allocate(mluOpCreateNmsDescriptor, + mluOpDestroyNmsDescriptor); + VLOG(5) << "tensor_boxes->getDim(): " << tensor_boxes->getDim(); MLUOP_CHECK(mluOpSetNmsDescriptor( nms_desc, box_mode, mode, algo, method_mode, iou_threshold, soft_nms_sigma, max_output_size, confidence_threshold, offset, @@ -148,10 +146,9 @@ void NmsExecutor::compute() { interface_timer_.start(); VLOG(4) << "tensor_confi=" << tensor_confi << ", dev_confi=" << dev_confi << ", workspace_size_=" << workspace_size_; - MLUOP_CHECK(mluOpNms( - handle_, nms_desc, tensor_boxes, dev_boxes, tensor_confi, dev_confi, - workspace_, workspace_size_, tensor_output, dev_output, - dev_output_size)); + MLUOP_CHECK(mluOpNms(handle_, nms_desc, tensor_boxes, dev_boxes, tensor_confi, + dev_confi, workspace_, workspace_size_, tensor_output, + dev_output, dev_output_size)); interface_timer_.stop(); VLOG(4) << "mluOpNms-end"; @@ -439,29 +436,29 @@ void NmsExecutor::cpuCompute() { int input_batches_num = 1; int input_classes_num = 1; int box_dim = 4; - if (input_box_desc->dim == 2) { + if (input_box_desc->getDim() == 2) { if (input_layout == 0) { // when layout is [boxes_num, 4], dims[0] represets the input number. - input_boxes_num = input_box_desc->dims[0]; - box_dim = input_box_desc->dims[1]; + input_boxes_num = input_box_desc->getDimIndex(0); + box_dim = input_box_desc->getDimIndex(1); } else if (input_layout == 1) { - input_boxes_num = input_box_desc->dims[1]; - box_dim = input_box_desc->dims[0]; + input_boxes_num = input_box_desc->getDimIndex(1); + box_dim = input_box_desc->getDimIndex(0); } else { VLOG(4) << "unsupport input layout now."; } } else { - // assert input_box_desc->dim == 3 - input_batches_num = input_box_desc->dims[0]; - input_classes_num = input_conf_desc->dims[1]; + // assert input_box_desc->getDim() == 3 + input_batches_num = input_box_desc->getDimIndex(0); + input_classes_num = input_conf_desc->getDimIndex(1); // keep content of algo and offset, algo is deprecated at // setNmsDescriptor_v4 algo = mluOpNmsAlgo_t::MLUOP_NMS_ALGO_INCLUDE_BOUNDARY; if (input_layout == 0) { // when layout is [boxes_num, 4], dims[0] represets the input number. - input_boxes_num = input_box_desc->dims[1]; + input_boxes_num = input_box_desc->getDimIndex(1); } else if (input_layout == 1) { - input_boxes_num = input_box_desc->dims[2]; + input_boxes_num = input_box_desc->getDimIndex(2); } else { VLOG(4) << "unsupport input layout now."; } @@ -525,8 +522,9 @@ void NmsExecutor::diffPreprocess() { parser_->getProtoNode()->nms_param().pad_to_max_output_size(); int output_mode_num = 1; int box_dim = 4; - if (tensor_boxes->dim == 2) { - box_dim = input_layout == 0 ? tensor_boxes->dims[1] : tensor_boxes->dims[0]; + if (tensor_boxes->getDim() == 2) { + box_dim = input_layout == 0 ? tensor_boxes->getDimIndex(1) + : tensor_boxes->getDimIndex(0); } if (box_dim == 7) { mode = static_cast(0); @@ -661,25 +659,25 @@ int64_t NmsExecutor::getTheoryOps() { int input_batches_num = 0; int input_classes_num = 0; int box_dim = 4; - if (input_desc->dim == 2) { + if (input_desc->getDim() == 2) { if (input_layout == 0) { - input_boxes_num = input_desc->dims[0]; - box_dim = input_desc->dims[1]; + input_boxes_num = input_desc->getDimIndex(0); + box_dim = input_desc->getDimIndex(1); } else if (input_layout == 1) { - input_boxes_num = input_desc->dims[1]; - box_dim = input_desc->dims[0]; + input_boxes_num = input_desc->getDimIndex(1); + box_dim = input_desc->getDimIndex(0); } else { VLOG(4) << "unsupport input layout now."; } } else { - // assert input_desc->dim == 3 - input_batches_num = input_desc->dims[0]; - input_classes_num = input_conf_desc->dims[1]; + // assert input_desc->getDim() == 3 + input_batches_num = input_desc->getDimIndex(0); + input_classes_num = input_conf_desc->getDimIndex(1); if (input_layout == 0) { // when layout is [boxes_num, 4], dims[0] represets the input number. - input_boxes_num = input_desc->dims[1]; + input_boxes_num = input_desc->getDimIndex(1); } else if (input_layout == 1) { - input_boxes_num = input_desc->dims[2]; + input_boxes_num = input_desc->getDimIndex(2); } else { VLOG(4) << "unsupport input layout now."; } @@ -688,8 +686,8 @@ int64_t NmsExecutor::getTheoryOps() { float *input_boxes = NULL; float *input_conf = NULL; if (device == Device::GPU) { - auto boxes_dtype = input_desc->dtype; - auto conf_dtype = input_conf_desc->dtype; + auto boxes_dtype = input_desc->getDtype(); + auto conf_dtype = input_conf_desc->getDtype(); int boxes_count_num = mluOpGetTensorElementNum(input_desc); int conf_count_num = mluOpGetTensorElementNum(input_conf_desc); float *boxes_host = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp index 8fc555ecd..1171ae821 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp @@ -105,7 +105,7 @@ void NmsRotatedExecutor::compute() { GTEST_CHECK(cnrtSuccess == cnrtMemset(result_num, 0, sizeof(int32_t))); // GTEST_CHECK(cnrtSuccess == cnrtMemset(dev_output, 0, - // output->dims[0] * sizeof(int64_t))); + // output->getDimIndex(0) * sizeof(int64_t))); VLOG(4) << "call mluOpNmsRotated()"; interface_timer_.start(); MLUOP_CHECK(mluOpGetNmsRotatedWorkspaceSize( @@ -124,8 +124,8 @@ void NmsRotatedExecutor::cpuCompute() { return; } - auto num_box = tensor_desc_[0].tensor->dims[0]; - auto box_dim = tensor_desc_[0].tensor->dims[1]; + auto num_box = tensor_desc_[0].tensor->getDimIndex(0); + auto box_dim = tensor_desc_[0].tensor->getDimIndex(1); float iou_threshold = parser_->getProtoNode()->nms_rotated_param().iou_threshold(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/points_in_boxes/points_in_boxes.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/points_in_boxes/points_in_boxes.cpp index 5b71a72d3..9f4fd1b88 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/points_in_boxes/points_in_boxes.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/points_in_boxes/points_in_boxes.cpp @@ -51,20 +51,22 @@ static void points_in_boxes_cpu( const mluOpTensorDescriptor_t points_desc, const void *points, const mluOpTensorDescriptor_t boxes_desc, const void *boxes, const mluOpTensorDescriptor_t points_indices_desc, void *points_indices) { - for (int64_t i = 0; - i < points_indices_desc->dims[0] * points_indices_desc->dims[1]; i++) { + for (int64_t i = 0; i < points_indices_desc->getDimIndex(0) * + points_indices_desc->getDimIndex(1); + i++) { *((float *)points_indices + i) = -1.0; } - for (int64_t i = 0; i < points_desc->dims[0]; i++) { - for (int64_t j = 0; j < points_desc->dims[1]; j++) { - for (int64_t m = 0; m < boxes_desc->dims[1]; m++) { + for (int64_t i = 0; i < points_desc->getDimIndex(0); i++) { + for (int64_t j = 0; j < points_desc->getDimIndex(1); j++) { + for (int64_t m = 0; m < boxes_desc->getDimIndex(1); m++) { float local_x, local_y; int cur_in_flag = check_pt_in_box3d_cpu( - (float *)points + (i * points_desc->dims[1] + j) * 3, - (float *)boxes + (i * boxes_desc->dims[1] + m) * 7, local_x, + (float *)points + (i * points_desc->getDimIndex(1) + j) * 3, + (float *)boxes + (i * boxes_desc->getDimIndex(1) + m) * 7, local_x, local_y); if (cur_in_flag) { - *((float *)points_indices + i * points_desc->dims[1] + j) = (float)m; + *((float *)points_indices + i * points_desc->getDimIndex(1) + j) = + (float)m; break; } } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp index 01dd4acd5..645782a21 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp @@ -114,7 +114,7 @@ void PolyNmsExecutor::pnmsComputeCPU(float *output_data, int *output_box_num, void PolyNmsExecutor::cpuCompute() { float iou_thresh = parser_->getProtoNode()->poly_nms_param().iou_threshold(); auto input_box_desc = tensor_desc_[0].tensor; - int input_boxes_num = input_box_desc->dims[0]; + int input_boxes_num = input_box_desc->getDimIndex(0); VLOG(4) << "[mluOpPolyNms] cpu compute start, input_boxes_num: " << input_boxes_num; @@ -133,7 +133,7 @@ void PolyNmsExecutor::cpuCompute() { int64_t PolyNmsExecutor::getTheoryOps() { VLOG(4) << "getTheoryOps"; int64_t theory_ops = 21650; - int dims = parser_->getMetaTensor("input1").tensor->dims[0]; + int dims = parser_->getMetaTensor("input1").tensor->getDimIndex(0); theory_ops = theory_ops * dims * dims; int64_t sort_ops = dims * dims - dims; theory_ops += sort_ops; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/prior_box/prior_box.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/prior_box/prior_box.cpp index 0643cccff..b5a8354e7 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/prior_box/prior_box.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/prior_box/prior_box.cpp @@ -192,12 +192,12 @@ void PriorBoxExecutor::cpuCompute() { float* variances = cpu_fp32_input_[2]; float* max_sizes = cpu_fp32_input_[3]; - const int min_sizes_num = min_sizes_desc_->total_element_num; - const int aspect_ratios_num = aspect_ratios_desc_->total_element_num; - const int variances_num = variances_desc_->total_element_num; - const int max_sizes_num = max_sizes_desc_->total_element_num; - const int output_num = output_desc_->total_element_num; - const int var_num = var_desc_->total_element_num; + const int min_sizes_num = min_sizes_desc_->getTotalElementNum(); + const int aspect_ratios_num = aspect_ratios_desc_->getTotalElementNum(); + const int variances_num = variances_desc_->getTotalElementNum(); + const int max_sizes_num = max_sizes_desc_->getTotalElementNum(); + const int output_num = output_desc_->getTotalElementNum(); + const int var_num = var_desc_->getTotalElementNum(); const float step_h = step_h_; const float step_w = step_w_; const float offset = offset_; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_backward/psamask_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_backward/psamask_backward.cpp index ab07fb416..6b7382791 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_backward/psamask_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_backward/psamask_backward.cpp @@ -163,15 +163,15 @@ void PsamaskBackwardExecutor::cpuCompute() { int w_mask = parser_->getProtoNode()->psamask_backward_param().w_mask(); int psa_type = parser_->getProtoNode()->psamask_backward_param().psa_type(); - auto batch = input_desc->dims[0]; - auto buffer_c = input_desc->dims[3]; - auto h_feature = input_desc->dims[1]; - auto w_feature = input_desc->dims[2]; - auto mask_c = output_desc->dims[3]; + auto batch = input_desc->getDimIndex(0); + auto buffer_c = input_desc->getDimIndex(3); + auto h_feature = input_desc->getDimIndex(1); + auto w_feature = input_desc->getDimIndex(2); + auto mask_c = output_desc->getDimIndex(3); int half_h_mask = (h_mask - 1) / 2; int half_w_mask = (w_mask - 1) / 2; - auto input_data_type = input_desc->dtype; + auto input_data_type = input_desc->getDtype(); psamaskType_t psamask_type = (psamaskType_t)psa_type; void *input = (void *)cpu_fp32_input_[0]; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_forward/psamask_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_forward/psamask_forward.cpp index 349703f18..334dcbee2 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_forward/psamask_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/psamask_forward/psamask_forward.cpp @@ -162,15 +162,15 @@ void PsamaskForwardExecutor::cpuCompute() { psamaskType_t psa_type = (psamaskType_t)parser_->getProtoNode() ->psamask_forward_param() .psa_type(); - auto batch = input_desc->dims[0]; - auto input_c = input_desc->dims[3]; - auto h_feature = input_desc->dims[1]; - auto w_feature = input_desc->dims[2]; - auto output_c = output_desc->dims[3]; + auto batch = input_desc->getDimIndex(0); + auto input_c = input_desc->getDimIndex(3); + auto h_feature = input_desc->getDimIndex(1); + auto w_feature = input_desc->getDimIndex(2); + auto output_c = output_desc->getDimIndex(3); int half_h_mask = (h_mask - 1) / 2; int half_w_mask = (w_mask - 1) / 2; - auto input_data_type = input_desc->dtype; + auto input_data_type = input_desc->getDtype(); psamaskType_t psamask_type = (psamaskType_t)psa_type; void *input = (void *)cpu_fp32_input_[0]; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_backward/psroipool_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_backward/psroipool_backward.cpp index 5a961a3a6..993c9036f 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_backward/psroipool_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_backward/psroipool_backward.cpp @@ -77,13 +77,13 @@ void PsroipoolBackwardExecutor::cpuCompute() { auto rois_cpu = cpu_fp32_input_[2]; auto bottom_output_cpu = cpu_fp32_output_[0]; - const int bottom_n = bottom_output_desc->dims[0]; - const int bottom_h = bottom_output_desc->dims[1]; - const int bottom_w = bottom_output_desc->dims[2]; - const int bottom_c = bottom_output_desc->dims[3]; + const int bottom_n = bottom_output_desc->getDimIndex(0); + const int bottom_h = bottom_output_desc->getDimIndex(1); + const int bottom_w = bottom_output_desc->getDimIndex(2); + const int bottom_c = bottom_output_desc->getDimIndex(3); - const int rois_n = rois_desc->dims[0]; - const int rois_offset = rois_desc->dims[1]; + const int rois_n = rois_desc->getDimIndex(0); + const int rois_offset = rois_desc->getDimIndex(1); for (int roi_id = 0; roi_id < rois_n; roi_id++) { int top_batch_offset = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_forward/psroipool_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_forward/psroipool_forward.cpp index dc3667e94..469479b57 100755 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_forward/psroipool_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/psroipool_forward/psroipool_forward.cpp @@ -95,13 +95,13 @@ void PsroipoolForwardExecutor::cpuCompute() { auto output_cpu = cpu_fp32_output_[0]; auto mapping_channel_cpu = cpu_fp32_output_[1]; - const int input_n = input_desc->dims[0]; - const int input_h = input_desc->dims[1]; - const int input_w = input_desc->dims[2]; - const int input_c = input_desc->dims[3]; + const int input_n = input_desc->getDimIndex(0); + const int input_h = input_desc->getDimIndex(1); + const int input_w = input_desc->getDimIndex(2); + const int input_c = input_desc->getDimIndex(3); - const int rois_n = rois_desc->dims[0]; - const int rois_offset = rois_desc->dims[1]; + const int rois_n = rois_desc->getDimIndex(0); + const int rois_offset = rois_desc->getDimIndex(1); for (int roi_id = 0; roi_id < rois_n; roi_id++) { int out_batch_offset = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp index 86cc8aa9e..4defcad30 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp @@ -159,20 +159,20 @@ void RoiAlignBackwardExecutor::cpuCompute() { parser_->getProtoNode()->roi_align_backward_param().pool_mode(); int version = parser_->getProtoNode()->roi_align_backward_param().version(); - size_t input_n = input_desc->dims[0]; - size_t input_h = input_desc->dims[1]; - size_t input_w = input_desc->dims[2]; - size_t input_c = input_desc->dims[3]; + size_t input_n = input_desc->getDimIndex(0); + size_t input_h = input_desc->getDimIndex(1); + size_t input_w = input_desc->getDimIndex(2); + size_t input_c = input_desc->getDimIndex(3); size_t input_offset_n = input_h * input_w * input_c; size_t input_offset_h = input_w * input_c; auto output = parser_->getMetaTensor(2).cpu_ptr; auto output_desc = parser_->getMetaTensor(2).tensor; if (pool_mode == 1) { - int output_n = output_desc->dims[0]; - int output_h = output_desc->dims[1]; - int output_w = output_desc->dims[2]; - int output_c = output_desc->dims[3]; + int output_n = output_desc->getDimIndex(0); + int output_h = output_desc->getDimIndex(1); + int output_w = output_desc->getDimIndex(2); + int output_c = output_desc->getDimIndex(3); std::memset(output, 0.0, parser_->getMetaTensor(2).size_in_bytes); size_t output_offset_n = output_h * output_w * output_c; @@ -253,10 +253,10 @@ void RoiAlignBackwardExecutor::cpuCompute() { output = parser_->getMetaTensor(4).cpu_ptr; output_desc = parser_->getMetaTensor(4).tensor; - size_t output_n = output_desc->dims[0]; - size_t output_h = output_desc->dims[1]; - size_t output_w = output_desc->dims[2]; - size_t output_c = output_desc->dims[3]; + size_t output_n = output_desc->getDimIndex(0); + size_t output_h = output_desc->getDimIndex(1); + size_t output_w = output_desc->getDimIndex(2); + size_t output_c = output_desc->getDimIndex(3); size_t output_offset_n = output_h * output_w * output_c; size_t output_offset_h = output_w * output_c; @@ -318,7 +318,7 @@ int64_t RoiAlignBackwardExecutor::getTheoryOps() { Device device = parser_->device(); if (device != Device::CPU) { auto boxes_desc = tensor_desc_[1].tensor; - auto boxes_dtype = boxes_desc->dtype; + auto boxes_dtype = boxes_desc->getDtype(); size_t boxes_num = parser_->getInputDataCount(1); float *boxes_ptr = (float *)cpu_runtime_.allocate(boxes_num * sizeof(float)); @@ -344,11 +344,11 @@ int64_t RoiAlignBackwardExecutor::getTheoryOps() { output_desc = parser_->getMetaTensor(4).tensor; } - size_t input_n = input_desc->dims[0]; - size_t input_h = input_desc->dims[1]; - size_t input_w = input_desc->dims[2]; - size_t input_c = input_desc->dims[3]; - size_t output_n = output_desc->dims[0]; + size_t input_n = input_desc->getDimIndex(0); + size_t input_h = input_desc->getDimIndex(1); + size_t input_w = input_desc->getDimIndex(2); + size_t input_c = input_desc->getDimIndex(3); + size_t output_n = output_desc->getDimIndex(0); for (int idx_n = 0; idx_n < input_n; idx_n++) { // check whether box_idx is valid diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_backward/roi_align_rotated_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_backward/roi_align_rotated_backward.cpp index 919f74500..b155b1e37 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_backward/roi_align_rotated_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_backward/roi_align_rotated_backward.cpp @@ -191,11 +191,11 @@ void RoiAlignRotatedBackwardExecutor::cpuCompute() { float *rois = cpu_fp32_input_[1]; // (n, 6) [batch_id, x, y, w, h, Θ] float *bottom_grad = cpu_fp32_output_[0]; - const int channel = top_grad_desc->dims[3]; - const int width = bottom_grad_desc->dims[2]; - const int height = bottom_grad_desc->dims[1]; - const int batch = bottom_grad_desc->dims[0]; - const int rois_nums = rois_desc->dims[0]; + const int channel = top_grad_desc->getDimIndex(3); + const int width = bottom_grad_desc->getDimIndex(2); + const int height = bottom_grad_desc->getDimIndex(1); + const int batch = bottom_grad_desc->getDimIndex(0); + const int rois_nums = rois_desc->getDimIndex(0); if (mluOpGetTensorElementNum(bottom_grad_desc) == 0) { return; @@ -300,7 +300,7 @@ int64_t RoiAlignRotatedBackwardExecutor::getTheoryOps() { if (unlikely(ts->empty())) { return 0; } - if (ts->dtype == MLUOP_DTYPE_FLOAT) { + if (ts->getDtype() == MLUOP_DTYPE_FLOAT) { ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * ts->sizeof_dtype); parser_->getInputTensorValue(i, (void *)ts->cpu_ptr, ts->shape_count); @@ -309,7 +309,7 @@ int64_t RoiAlignRotatedBackwardExecutor::getTheoryOps() { parser_->getInputTensorValue(i, temp, ts->shape_count); ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * sizeof(float)); - castDataOut(temp, ts->dtype, ts->cpu_ptr, MLUOP_DTYPE_FLOAT, + castDataOut(temp, ts->getDtype(), ts->cpu_ptr, MLUOP_DTYPE_FLOAT, ts->shape_count, NO_QUANT); } cpu_fp32_input_.push_back(ts->cpu_ptr); @@ -319,7 +319,7 @@ int64_t RoiAlignRotatedBackwardExecutor::getTheoryOps() { if (unlikely(ts->empty())) { return 0; } - if (ts->dtype == MLUOP_DTYPE_FLOAT) { + if (ts->getDtype() == MLUOP_DTYPE_FLOAT) { ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * ts->sizeof_dtype); parser_->getOutputTensorValue(i, (void *)ts->cpu_ptr, ts->shape_count); @@ -328,7 +328,7 @@ int64_t RoiAlignRotatedBackwardExecutor::getTheoryOps() { parser_->getOutputTensorValue(i, temp, ts->shape_count); ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * sizeof(float)); - castDataOut(temp, ts->dtype, ts->cpu_ptr, MLUOP_DTYPE_FLOAT, + castDataOut(temp, ts->getDtype(), ts->cpu_ptr, MLUOP_DTYPE_FLOAT, ts->shape_count, NO_QUANT); } cpu_fp32_output_.push_back(ts->cpu_ptr); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_forward/roi_align_rotated_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_forward/roi_align_rotated_forward.cpp index b17c99fb3..33e6cca99 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_forward/roi_align_rotated_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_rotated_forward/roi_align_rotated_forward.cpp @@ -184,11 +184,11 @@ void RoiAlignRotatedForwardExecutor::cpuCompute() { float *rois = cpu_fp32_input_[1]; // (n, 6) [batch_id, x, y, w, h, Θ] float *output = cpu_fp32_output_[0]; - const int channel = features_desc->dims[3]; - const int width = features_desc->dims[2]; - const int height = features_desc->dims[1]; - const int batch = features_desc->dims[0]; - const int rois_nums = rois_desc->dims[0]; + const int channel = features_desc->getDimIndex(3); + const int width = features_desc->getDimIndex(2); + const int height = features_desc->getDimIndex(1); + const int batch = features_desc->getDimIndex(0); + const int rois_nums = rois_desc->getDimIndex(0); if (mluOpGetTensorElementNum(features_desc) == 0) { return; @@ -288,7 +288,7 @@ int64_t RoiAlignRotatedForwardExecutor::getTheoryOps() { if (unlikely(ts->empty())) { return 0; } - if (ts->dtype == MLUOP_DTYPE_FLOAT) { + if (ts->getDtype() == MLUOP_DTYPE_FLOAT) { ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * ts->sizeof_dtype); parser_->getInputTensorValue(i, (void *)ts->cpu_ptr, ts->shape_count); @@ -297,7 +297,7 @@ int64_t RoiAlignRotatedForwardExecutor::getTheoryOps() { parser_->getInputTensorValue(i, temp, ts->shape_count); ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * sizeof(float)); - castDataOut(temp, ts->dtype, ts->cpu_ptr, MLUOP_DTYPE_FLOAT, + castDataOut(temp, ts->getDtype(), ts->cpu_ptr, MLUOP_DTYPE_FLOAT, ts->shape_count, NO_QUANT); } cpu_fp32_input_.push_back(ts->cpu_ptr); @@ -307,7 +307,7 @@ int64_t RoiAlignRotatedForwardExecutor::getTheoryOps() { if (unlikely(ts->empty())) { return 0; } - if (ts->dtype == MLUOP_DTYPE_FLOAT) { + if (ts->getDtype() == MLUOP_DTYPE_FLOAT) { ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * ts->sizeof_dtype); parser_->getOutputTensorValue(i, (void *)ts->cpu_ptr, ts->shape_count); @@ -316,7 +316,7 @@ int64_t RoiAlignRotatedForwardExecutor::getTheoryOps() { parser_->getOutputTensorValue(i, temp, ts->shape_count); ts->cpu_ptr = (float *)cpu_runtime_.allocate(ts->shape_count * sizeof(float)); - castDataOut(temp, ts->dtype, ts->cpu_ptr, MLUOP_DTYPE_FLOAT, + castDataOut(temp, ts->getDtype(), ts->cpu_ptr, MLUOP_DTYPE_FLOAT, ts->shape_count, NO_QUANT); } cpu_fp32_output_.push_back(ts->cpu_ptr); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_backward/roi_crop_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_backward/roi_crop_backward.cpp index 4acf5d3af..672d51f18 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_backward/roi_crop_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_backward/roi_crop_backward.cpp @@ -40,13 +40,13 @@ void RoiCropBackwardExecutor::initData() { grad_output_desc_ = tensor_desc_[0].tensor; grid_desc_ = tensor_desc_[1].tensor; grad_input_desc_ = tensor_desc_[2].tensor; - grad_output_h_ = grad_output_desc_->dims[1]; - grad_output_w_ = grad_output_desc_->dims[2]; - grid_batch_roi_ = grid_desc_->dims[0]; - grad_input_batch_ = grad_input_desc_->dims[0]; - grad_input_h_ = grad_input_desc_->dims[1]; - grad_input_w_ = grad_input_desc_->dims[2]; - grad_input_c_ = grad_input_desc_->dims[3]; + grad_output_h_ = grad_output_desc_->getDimIndex(1); + grad_output_w_ = grad_output_desc_->getDimIndex(2); + grid_batch_roi_ = grid_desc_->getDimIndex(0); + grad_input_batch_ = grad_input_desc_->getDimIndex(0); + grad_input_h_ = grad_input_desc_->getDimIndex(1); + grad_input_w_ = grad_input_desc_->getDimIndex(2); + grad_input_c_ = grad_input_desc_->getDimIndex(3); VLOG(4) << "[RoiCropBackwardExecutor] call initData() end."; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_forward/roi_crop_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_forward/roi_crop_forward.cpp index a4c6bd0ba..a475d7467 100755 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_forward/roi_crop_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_crop_forward/roi_crop_forward.cpp @@ -40,13 +40,13 @@ void RoiCropForwardExecutor::initData() { input_desc_ = tensor_desc_[0].tensor; grid_desc_ = tensor_desc_[1].tensor; output_desc_ = tensor_desc_[2].tensor; - input_batch_ = input_desc_->dims[0]; - input_h_ = input_desc_->dims[1]; - input_w_ = input_desc_->dims[2]; - input_c_ = input_desc_->dims[3]; - grid_batch_roi_ = grid_desc_->dims[0]; - output_h_ = output_desc_->dims[1]; - output_w_ = output_desc_->dims[2]; + input_batch_ = input_desc_->getDimIndex(0); + input_h_ = input_desc_->getDimIndex(1); + input_w_ = input_desc_->getDimIndex(2); + input_c_ = input_desc_->getDimIndex(3); + grid_batch_roi_ = grid_desc_->getDimIndex(0); + output_h_ = output_desc_->getDimIndex(1); + output_w_ = output_desc_->getDimIndex(2); VLOG(4) << "[RoiCropForwardExecutor] call initData() End."; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp index df34ac217..97a24d360 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp @@ -78,20 +78,20 @@ void RoiPoolingBackwardExecutor::cpuCompute() { PoolingForwardMode mode = parser_->getProtoNode()->roi_pooling_backward_param().mode(); - size_t grads_n = grads_desc->dims[0]; - size_t grads_h = grads_desc->dims[1]; - size_t grads_w = grads_desc->dims[2]; - size_t grads_c = grads_desc->dims[3]; - size_t num1 = rois_desc->dims[0]; - size_t num2 = rois_desc->dims[1]; - size_t argmax_n = argmax_desc->dims[0]; - size_t argmax_h = argmax_desc->dims[1]; - size_t argmax_w = argmax_desc->dims[2]; - size_t argmax_c = argmax_desc->dims[3]; - size_t grads_image_n = grads_image_desc->dims[0]; - size_t grads_image_h = grads_image_desc->dims[1]; - size_t grads_image_w = grads_image_desc->dims[2]; - size_t grads_image_c = grads_image_desc->dims[3]; + size_t grads_n = grads_desc->getDimIndex(0); + size_t grads_h = grads_desc->getDimIndex(1); + size_t grads_w = grads_desc->getDimIndex(2); + size_t grads_c = grads_desc->getDimIndex(3); + size_t num1 = rois_desc->getDimIndex(0); + size_t num2 = rois_desc->getDimIndex(1); + size_t argmax_n = argmax_desc->getDimIndex(0); + size_t argmax_h = argmax_desc->getDimIndex(1); + size_t argmax_w = argmax_desc->getDimIndex(2); + size_t argmax_c = argmax_desc->getDimIndex(3); + size_t grads_image_n = grads_image_desc->getDimIndex(0); + size_t grads_image_h = grads_image_desc->getDimIndex(1); + size_t grads_image_w = grads_image_desc->getDimIndex(2); + size_t grads_image_c = grads_image_desc->getDimIndex(3); const int batch_size = grads_image_n; const int channels = grads_image_c; @@ -192,7 +192,7 @@ int64_t RoiPoolingBackwardExecutor::getTheoryOps() { Device device = parser_->device(); if (device != Device::CPU) { auto argmax_desc = tensor_desc_[2].tensor; - auto argmax_dtype = argmax_desc->dtype; + auto argmax_dtype = argmax_desc->getDtype(); size_t argmax_num = parser_->getInputDataCount(2); float *argmax = (float *)cpu_runtime_.allocate(argmax_num * sizeof(float)); castDataOut(data_vector_[2].host_ptr, argmax_dtype, (float *)argmax, @@ -205,14 +205,14 @@ int64_t RoiPoolingBackwardExecutor::getTheoryOps() { auto grads_desc = parser_->getMetaTensor(0).tensor; auto grads_image_desc = parser_->getMetaTensor(3).tensor; - size_t grads_n = grads_desc->dims[0]; - size_t grads_h = grads_desc->dims[1]; - size_t grads_w = grads_desc->dims[2]; - size_t grads_c = grads_desc->dims[3]; - size_t grads_image_n = grads_image_desc->dims[0]; - size_t grads_image_h = grads_image_desc->dims[1]; - size_t grads_image_w = grads_image_desc->dims[2]; - size_t grads_image_c = grads_image_desc->dims[3]; + size_t grads_n = grads_desc->getDimIndex(0); + size_t grads_h = grads_desc->getDimIndex(1); + size_t grads_w = grads_desc->getDimIndex(2); + size_t grads_c = grads_desc->getDimIndex(3); + size_t grads_image_n = grads_image_desc->getDimIndex(0); + size_t grads_image_h = grads_image_desc->getDimIndex(1); + size_t grads_image_w = grads_image_desc->getDimIndex(2); + size_t grads_image_c = grads_image_desc->getDimIndex(3); theory_ops += grads_image_n * grads_image_h * grads_image_w * grads_image_c; for (size_t i = 0; i < grads_n * grads_h * grads_w * grads_c; i++) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_forward/roi_pooling_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_forward/roi_pooling_forward.cpp index 19eadd31b..bbcee5475 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_forward/roi_pooling_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_forward/roi_pooling_forward.cpp @@ -27,7 +27,7 @@ #define getParam(ty, ctx) \ (ty) parser_->getProtoNode()->roi_pooling_forward_param().ctx() -#define getTensorDims(x, y) tensor_desc_[x].tensor->dims[y] +#define getTensorDims(x, y) tensor_desc_[x].tensor->getDimIndex(y) #define getTensorDesc(x) tensor_desc_[x].tensor #define getDevicePtr(x) data_vector_[x].device_ptr diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roialign_forward/roialign_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roialign_forward/roialign_forward.cpp index 6ea61953d..b84da7fb8 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roialign_forward/roialign_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roialign_forward/roialign_forward.cpp @@ -51,8 +51,8 @@ void RoialignForwardExecutor::compute() { auto input_desc = parser_->getMetaTensor(0).tensor; auto input_rois_desc = parser_->getMetaTensor(1).tensor; auto output_desc = parser_->getMetaTensor(2).tensor; - int pooled_height = output_desc->dims[1]; - int pooled_width = output_desc->dims[2]; + int pooled_height = output_desc->getDimIndex(1); + int pooled_width = output_desc->getDimIndex(2); mluOpRoiAlignForwardDescriptor_t roialign_desc; mluOpCreateRoiAlignForwardDescriptor(&roialign_desc); @@ -136,14 +136,14 @@ void RoialignForwardExecutor::cpuCompute() { int verison = parser_->getProtoNode()->roialign_param().version(); int pool_mode = parser_->getProtoNode()->roialign_param().pool_mode(); - int input_height = input_desc->dims[1]; - int input_width = input_desc->dims[2]; - int pooled_height = output_desc->dims[1]; - int pooled_width = output_desc->dims[2]; - int channels = input_desc->dims[3]; - int num_rois = input_rois_desc->dims[0]; - int roi_offset = input_rois_desc->dims[1]; - int input_n = input_desc->dims[0]; + int input_height = input_desc->getDimIndex(1); + int input_width = input_desc->getDimIndex(2); + int pooled_height = output_desc->getDimIndex(1); + int pooled_width = output_desc->getDimIndex(2); + int channels = input_desc->getDimIndex(3); + int num_rois = input_rois_desc->getDimIndex(0); + int roi_offset = input_rois_desc->getDimIndex(1); + int input_n = input_desc->getDimIndex(0); float *input = cpu_fp32_input_[0]; float *input_rois = cpu_fp32_input_[1]; // (n, 5) { n, x0, y0, x1, y1} @@ -383,20 +383,20 @@ int64_t RoialignForwardExecutor::getTheoryOps() { auto input_rois_desc = parser_->getMetaTensor(1).tensor; auto output_desc = parser_->getMetaTensor(2).tensor; - int input_height = input_desc->dims[1]; - int input_width = input_desc->dims[2]; - int pooled_height = output_desc->dims[1]; - int pooled_width = output_desc->dims[2]; - int channels = input_desc->dims[3]; - int num_rois = input_rois_desc->dims[0]; - int roi_offset = input_rois_desc->dims[1]; + int input_height = input_desc->getDimIndex(1); + int input_width = input_desc->getDimIndex(2); + int pooled_height = output_desc->getDimIndex(1); + int pooled_width = output_desc->getDimIndex(2); + int channels = input_desc->getDimIndex(3); + int num_rois = input_rois_desc->getDimIndex(0); + int roi_offset = input_rois_desc->getDimIndex(1); int64_t theory_ops = 0; Device device = parser_->device(); float *input_rois = NULL; - auto rois_dtype = input_rois_desc->dtype; - int rois_count_num = num_rois * input_rois_desc->dims[1]; + auto rois_dtype = input_rois_desc->getDtype(); + int rois_count_num = num_rois * input_rois_desc->getDimIndex(1); float *rois_host = (float *)cpu_runtime_.allocate(rois_count_num * sizeof(float)); castDataOut(data_vector_[1].host_ptr, rois_dtype, (float *)rois_host, @@ -465,20 +465,20 @@ int64_t RoialignForwardExecutor::getTheoryIoSize() { auto output_desc = parser_->getMetaTensor(2).tensor; int pool_mode = parser_->getProtoNode()->roialign_param().pool_mode(); - int input_height = input_desc->dims[1]; - int input_width = input_desc->dims[2]; - int pooled_height = output_desc->dims[1]; - int pooled_width = output_desc->dims[2]; - int channels = input_desc->dims[3]; - int num_rois = input_rois_desc->dims[0]; - int roi_offset = input_rois_desc->dims[1]; + int input_height = input_desc->getDimIndex(1); + int input_width = input_desc->getDimIndex(2); + int pooled_height = output_desc->getDimIndex(1); + int pooled_width = output_desc->getDimIndex(2); + int channels = input_desc->getDimIndex(3); + int num_rois = input_rois_desc->getDimIndex(0); + int roi_offset = input_rois_desc->getDimIndex(1); int64_t theory_io_size = 0; Device device = parser_->device(); float *input_rois = NULL; - auto rois_dtype = input_rois_desc->dtype; - int rois_count_num = num_rois * input_rois_desc->dims[1]; + auto rois_dtype = input_rois_desc->getDtype(); + int rois_count_num = num_rois * input_rois_desc->getDimIndex(1); float *rois_host = (float *)cpu_runtime_.allocate(rois_count_num * sizeof(float)); castDataOut(data_vector_[1].host_ptr, rois_dtype, (float *)rois_host, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roiaware_pool3d_backward/roiaware_pool3d_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roiaware_pool3d_backward/roiaware_pool3d_backward.cpp index 55e85b38d..165a1d73c 100755 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roiaware_pool3d_backward/roiaware_pool3d_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roiaware_pool3d_backward/roiaware_pool3d_backward.cpp @@ -154,7 +154,7 @@ void RoiawarePool3dBackwardExecutor::initData() { max_pts_each_voxel_ = roiaware_pool3d_backward_proto_desc.max_pts_each_voxel(); - pts_num_ = desc_grad_in_->dims[0]; + pts_num_ = desc_grad_in_->getDimIndex(0); VLOG(4) << "RoiawarePool3dBackwardExecutor::initData() End."; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roipoint_pool3d/roipoint_pool3d.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roipoint_pool3d/roipoint_pool3d.cpp index 1637d5e6e..56a0dfba5 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roipoint_pool3d/roipoint_pool3d.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roipoint_pool3d/roipoint_pool3d.cpp @@ -191,11 +191,11 @@ void RoipointPool3dExecutor::workspaceMalloc() { auto tensor_pooled_features = tensor_desc_[3].tensor; auto tensor_pooled_empty_flag = tensor_desc_[4].tensor; - int batch_size = tensor_points->dims[0]; - int pts_num = tensor_points->dims[1]; - int boxes_num = tensor_boxes3d->dims[1]; - int feature_len = tensor_point_features->dims[2]; - int sampled_pts_num = tensor_pooled_features->dims[2]; + int batch_size = tensor_points->getDimIndex(0); + int pts_num = tensor_points->getDimIndex(1); + int boxes_num = tensor_boxes3d->getDimIndex(1); + int feature_len = tensor_point_features->getDimIndex(2); + int sampled_pts_num = tensor_pooled_features->getDimIndex(2); void *workspace_ptr = nullptr; MLUOP_CHECK(mluOpGetRoiPointPool3dWorkspaceSize( @@ -223,10 +223,10 @@ void RoipointPool3dExecutor::compute() { auto tensor_pooled_features = tensor_desc_[3].tensor; auto tensor_pooled_empty_flag = tensor_desc_[4].tensor; - int batch_size = tensor_points->dims[0]; - int pts_num = tensor_points->dims[1]; - int boxes_num = tensor_boxes3d->dims[1]; - int feature_len = tensor_point_features->dims[2]; + int batch_size = tensor_points->getDimIndex(0); + int pts_num = tensor_points->getDimIndex(1); + int boxes_num = tensor_boxes3d->getDimIndex(1); + int feature_len = tensor_point_features->getDimIndex(2); auto dev_points = data_vector_[0].device_ptr; auto dev_point_features = data_vector_[1].device_ptr; @@ -253,11 +253,11 @@ void RoipointPool3dExecutor::cpuCompute() { auto tensor_pooled_features = tensor_desc_[3].tensor; auto tensor_pooled_empty_flag = tensor_desc_[4].tensor; - int batch_size = tensor_points->dims[0]; - int pts_num = tensor_points->dims[1]; - int boxes_num = tensor_boxes3d->dims[1]; - int feature_len = tensor_point_features->dims[2]; - int sampled_pts_num = tensor_pooled_features->dims[2]; + int batch_size = tensor_points->getDimIndex(0); + int pts_num = tensor_points->getDimIndex(1); + int boxes_num = tensor_boxes3d->getDimIndex(1); + int feature_len = tensor_point_features->getDimIndex(2); + int sampled_pts_num = tensor_pooled_features->getDimIndex(2); auto points = cpu_fp32_input_[0]; auto point_features = cpu_fp32_input_[1]; @@ -299,11 +299,11 @@ int64_t RoipointPool3dExecutor::getTheoryOps() { auto tensor_pooled_features = tensor_desc_[3].tensor; auto tensor_pooled_empty_flag = tensor_desc_[4].tensor; - int64_t batch_size = tensor_points->dims[0]; - int64_t pts_num = tensor_points->dims[1]; - int64_t boxes_num = tensor_boxes3d->dims[1]; - int64_t feature_len = tensor_point_features->dims[2]; - int64_t sampled_pts_num = tensor_pooled_features->dims[2]; + int64_t batch_size = tensor_points->getDimIndex(0); + int64_t pts_num = tensor_points->getDimIndex(1); + int64_t boxes_num = tensor_boxes3d->getDimIndex(1); + int64_t feature_len = tensor_point_features->getDimIndex(2); + int64_t sampled_pts_num = tensor_pooled_features->getDimIndex(2); int64_t count = 21 + feature_len; int64_t theory_ops = batch_size * pts_num * count * boxes_num; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_backward/rotated_feature_align_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_backward/rotated_feature_align_backward.cpp index 306f23c66..54c0d74f6 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_backward/rotated_feature_align_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_backward/rotated_feature_align_backward.cpp @@ -103,11 +103,11 @@ void RotatedFeatureAlignBackwardExecutor::cpuCompute() { const int output_size = parser_->getOutputDataCount(0); auto top_output_desc = tensor_desc_[0].tensor; auto bboxes_desc = tensor_desc_[1].tensor; - const int batch = top_output_desc->dims[0]; - const int height = top_output_desc->dims[1]; - const int width = top_output_desc->dims[2]; - const int channels = top_output_desc->dims[3]; - const int bboxes_offset = bboxes_desc->dims[3]; + const int batch = top_output_desc->getDimIndex(0); + const int height = top_output_desc->getDimIndex(1); + const int width = top_output_desc->getDimIndex(2); + const int channels = top_output_desc->getDimIndex(3); + const int bboxes_offset = bboxes_desc->getDimIndex(3); float px[5] = {0, 0, 0, 0, 0}; float py[5] = {0, 0, 0, 0, 0}; for (int index = 0; index < output_size; index++) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_forward/rotated_feature_align_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_forward/rotated_feature_align_forward.cpp index df0b085dd..e2217444c 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_forward/rotated_feature_align_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/rotated_feature_align_forward/rotated_feature_align_forward.cpp @@ -109,11 +109,11 @@ void RotatedFeatureAlignForwardExecutor::cpuCompute() { auto input_desc = tensor_desc_[0].tensor; auto bboxes_desc = tensor_desc_[1].tensor; - const int batch = input_desc->dims[0]; - const int height = input_desc->dims[1]; - const int width = input_desc->dims[2]; - const int channels = input_desc->dims[3]; - const int bboxes_offset = bboxes_desc->dims[3]; + const int batch = input_desc->getDimIndex(0); + const int height = input_desc->getDimIndex(1); + const int width = input_desc->getDimIndex(2); + const int channels = input_desc->getDimIndex(3); + const int bboxes_offset = bboxes_desc->getDimIndex(3); float px[5] = {0, 0, 0, 0, 0}; float py[5] = {0, 0, 0, 0, 0}; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp index 8334dd344..d29beb6e7 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp @@ -108,7 +108,8 @@ void SyncBatchNormBackwardElemtExecutor::compute() { void SyncBatchNormBackwardElemtExecutor::cpuCompute() { int len_x = parser_->getInputDataCount(0); - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); if (len_x == 0 || len_c == 0) { VLOG(4) << "SyncBatchNormBackwardElemtExecutor: cpu compute zero elemt"; @@ -143,7 +144,8 @@ void SyncBatchNormBackwardElemtExecutor::cpuCompute() { int64_t SyncBatchNormBackwardElemtExecutor::getTheoryOps() { int64_t theory_ops = 0; int len_x = parser_->getInputDataCount(0); - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); if (parser_->getInputNum() == 7) { theory_ops = 5 * len_x + 3 * len_c; } else { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp index 0c67b8520..79e3c9a9e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp @@ -115,8 +115,9 @@ void SyncBatchnormBackwardElemtV2Executor::compute() { void SyncBatchnormBackwardElemtV2Executor::cpuCompute() { int len_x = parser_->getInputDataCount(0); - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; - int len_n = tensor_desc_[0].tensor->dims[0]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); + int len_n = tensor_desc_[0].tensor->getDimIndex(0); if (len_x == 0 || len_c == 0) { VLOG(4) << "SyncBatchnormBackwardElemtV2Executor: cpu compute zero elemt"; @@ -158,7 +159,8 @@ void SyncBatchnormBackwardElemtV2Executor::cpuCompute() { int64_t SyncBatchnormBackwardElemtV2Executor::getTheoryOps() { int64_t theory_ops = 0; int len_x = parser_->getInputDataCount(0); - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); if (parser_->getInputNum() == 7) { theory_ops = 5 * len_x + 3 * len_c; } else { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp index b0b415fd2..fdf38fc66 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -230,7 +230,8 @@ void cpuGetSyncBnBkwReduceOuput( } void SyncBatchnormBackwardReduceExecutor::cpuCompute() { - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); int len_x = parser_->getInputDataCount(0); const bool needs_input_grad0 = parser_->getProtoNode() ->sync_batchnorm_backward_reduce_param() diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp index c1db5a4a6..6b174ab61 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp @@ -80,7 +80,8 @@ void cpuSyncBNElemt(const float *x, const float *mean, const float *invstd, } void SyncBatchnormElemtExecutor::cpuCompute() { - int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_c = + tensor_desc_[0].tensor->getDimIndex(tensor_desc_[0].tensor->getDim() - 1); int len_x = parser_->getInputDataCount(0); VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute begin"; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp index 72832d21e..7c568f323 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp @@ -222,33 +222,33 @@ void SyncBatchnormGatherStatsWithCountsExecutor::cpuCompute() { ->sync_batchnorm_gather_stats_with_counts_param() .momentum(); - int idx_c = tensor_desc_[0].tensor->dim - 1; - int len_c = tensor_desc_[0].tensor->dims[idx_c]; + int idx_c = tensor_desc_[0].tensor->getDim() - 1; + int len_c = tensor_desc_[0].tensor->getDimIndex(idx_c); int len_count_all = 1; int len_mean_all = 1; int len_invstd_all = 1; if (parser_->getInputNum() == 3) { - len_count_all = tensor_desc_[2].tensor->dims[0]; + len_count_all = tensor_desc_[2].tensor->getDimIndex(0); } else if (parser_->getInputNum() == 4) { - len_count_all = tensor_desc_[3].tensor->dims[0]; + len_count_all = tensor_desc_[3].tensor->getDimIndex(0); } else if (parser_->getInputNum() == 5) { - len_count_all = tensor_desc_[4].tensor->dims[0]; + len_count_all = tensor_desc_[4].tensor->getDimIndex(0); } else if (parser_->getInputNum() == 6) { - len_count_all = tensor_desc_[5].tensor->dims[0]; + len_count_all = tensor_desc_[5].tensor->getDimIndex(0); } if (parser_->getInputNum() == 3 || parser_->getInputNum() == 5) { - for (int i = 0; i < tensor_desc_[0].tensor->dim; ++i) { - len_mean_all *= tensor_desc_[0].tensor->dims[i]; + for (int i = 0; i < tensor_desc_[0].tensor->getDim(); ++i) { + len_mean_all *= tensor_desc_[0].tensor->getDimIndex(i); } - for (int i = 0; i < tensor_desc_[1].tensor->dim; ++i) { - len_invstd_all *= tensor_desc_[1].tensor->dims[i]; + for (int i = 0; i < tensor_desc_[1].tensor->getDim(); ++i) { + len_invstd_all *= tensor_desc_[1].tensor->getDimIndex(i); } } else { - for (int i = 0; i < tensor_desc_[1].tensor->dim; ++i) { - len_mean_all *= tensor_desc_[1].tensor->dims[i]; + for (int i = 0; i < tensor_desc_[1].tensor->getDim(); ++i) { + len_mean_all *= tensor_desc_[1].tensor->getDimIndex(i); } - for (int i = 0; i < tensor_desc_[2].tensor->dim; ++i) { - len_invstd_all *= tensor_desc_[2].tensor->dims[i]; + for (int i = 0; i < tensor_desc_[2].tensor->getDim(); ++i) { + len_invstd_all *= tensor_desc_[2].tensor->getDimIndex(i); } } if (len_mean_all == 0 || len_c == 0 || len_count_all == 0 || diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp index e32d9d211..4a2e02644 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp @@ -117,11 +117,11 @@ void cpuSyncBatchNormStats(const float *x, const float eps, float *mean, void SyncBatchnormStatsExecutor::cpuCompute() { float eps = parser_->getProtoNode()->sync_batchnorm_stats_param().eps(); - int idx_c = tensor_desc_[0].tensor->dim - 1; - int len_c = tensor_desc_[0].tensor->dims[idx_c]; + int idx_c = tensor_desc_[0].tensor->getDim() - 1; + int len_c = tensor_desc_[0].tensor->getDimIndex(idx_c); int len_x = 1; - for (int i = 0; i < tensor_desc_[0].tensor->dim; ++i) { - len_x *= tensor_desc_[0].tensor->dims[i]; + for (int i = 0; i < tensor_desc_[0].tensor->getDim(); ++i) { + len_x *= tensor_desc_[0].tensor->getDimIndex(i); } if (len_x == 0 || len_c == 0) { return; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_backward/three_interpolate_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_backward/three_interpolate_backward.cpp index d71e68098..50ffb33b9 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_backward/three_interpolate_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_backward/three_interpolate_backward.cpp @@ -41,10 +41,10 @@ void ThreeInterpolateBackwardExecutor::compute() { auto indices_data_ptr = data_vector_[1].device_ptr; auto weights_data_ptr = data_vector_[2].device_ptr; auto grad_features_data_ptr = data_vector_[3].device_ptr; - b_ = grad_output_desc->dims[0]; - c_ = grad_output_desc->dims[1]; - n_ = grad_output_desc->dims[2]; - m_ = grad_features_desc->dims[2]; + b_ = grad_output_desc->getDimIndex(0); + c_ = grad_output_desc->getDimIndex(1); + n_ = grad_output_desc->getDimIndex(2); + m_ = grad_features_desc->getDimIndex(2); VLOG(4) << "call mluOpThreeInterpolateBackward()"; interface_timer_.start(); MLUOP_CHECK(mluOpThreeInterpolateBackward( diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_forward/three_interpolate_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_forward/three_interpolate_forward.cpp index c428fcb35..0b6038c4b 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_forward/three_interpolate_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/three_interpolate_forward/three_interpolate_forward.cpp @@ -41,10 +41,10 @@ void ThreeInterpolateForwardExecutor::compute() { auto indices_data_ptr = data_vector_[1].device_ptr; auto weights_data_ptr = data_vector_[2].device_ptr; auto output_data_ptr = data_vector_[3].device_ptr; - b_ = features_desc->dims[0]; - c_ = features_desc->dims[1]; - m_ = features_desc->dims[2]; - n_ = output_desc->dims[2]; + b_ = features_desc->getDimIndex(0); + c_ = features_desc->getDimIndex(1); + m_ = features_desc->getDimIndex(2); + n_ = output_desc->getDimIndex(2); VLOG(4) << "call mluOpThreeInterpolateForward()"; interface_timer_.start(); MLUOP_CHECK(mluOpThreeInterpolateForward( diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_backward/tin_shift_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_backward/tin_shift_backward.cpp index c74834c02..6e0d0e627 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_backward/tin_shift_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_backward/tin_shift_backward.cpp @@ -55,12 +55,12 @@ void TinShiftBackwardExecutor::cpuCompute() { auto x = tensor_desc_[0].tensor; auto x1 = tensor_desc_[1].tensor; auto count1 = parser_->getInputDataCount(0); - int batch_size = x->dims[0]; - int t_size = x->dims[1]; - int channels = x->dims[2]; - int hw_size = x->dims[3]; - int group_batch = x1->dims[0]; - int group_size = x1->dims[1]; + int batch_size = x->getDimIndex(0); + int t_size = x->getDimIndex(1); + int channels = x->getDimIndex(2); + int hw_size = x->getDimIndex(3); + int group_batch = x1->getDimIndex(0); + int group_size = x1->getDimIndex(1); int group_channel = channels / group_size; for (int index = 0; index < count1; index++) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_forward/tin_shift_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_forward/tin_shift_forward.cpp index 4c65b8f88..ddcccc2f4 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_forward/tin_shift_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/tin_shift_forward/tin_shift_forward.cpp @@ -55,12 +55,12 @@ void TinShiftForwardExecutor::cpuCompute() { auto x = tensor_desc_[0].tensor; auto x1 = tensor_desc_[1].tensor; auto count1 = parser_->getInputDataCount(0); - int batch_size = x->dims[0]; - int t_size = x->dims[1]; - int channels = x->dims[2]; - int hw_size = x->dims[3]; - int group_batch = x1->dims[0]; - int group_size = x1->dims[1]; + int batch_size = x->getDimIndex(0); + int t_size = x->getDimIndex(1); + int channels = x->getDimIndex(2); + int hw_size = x->getDimIndex(3); + int group_batch = x1->getDimIndex(0); + int group_size = x1->getDimIndex(1); int group_channel = channels / group_size; for (int index = 0; index < count1; index++) { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/voxelization/voxelization.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/voxelization/voxelization.cpp index f8338257a..8b99c7f22 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/voxelization/voxelization.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/voxelization/voxelization.cpp @@ -329,8 +329,8 @@ void VoxelizationExecutor::cpuCompute() { parser_->getProtoNode()->voxelization_param().deterministic(); auto tensor_points = tensor_desc_[0].tensor; - size_t num_points = tensor_points->dims[0]; - size_t num_features = tensor_points->dims[1]; + size_t num_points = tensor_points->getDimIndex(0); + size_t num_features = tensor_points->getDimIndex(1); float *points = cpu_fp32_input_[0]; float *voxel_size = cpu_fp32_input_[1]; @@ -359,8 +359,8 @@ int64_t VoxelizationExecutor::getTheoryIoSize() { parser_->getProtoNode()->voxelization_param().max_voxels(); auto tensor_points = tensor_desc_[0].tensor; - size_t num_points = tensor_points->dims[0]; - size_t num_features = tensor_points->dims[1]; + size_t num_points = tensor_points->getDimIndex(0); + size_t num_features = tensor_points->getDimIndex(1); int64_t total_size = 0; // mluOpUnionKernelDynamicVoxelize @@ -387,7 +387,7 @@ int64_t VoxelizationExecutor::getTheoryIoSize() { int64_t VoxelizationExecutor::getTheoryOps() { auto tensor_points = tensor_desc_[0].tensor; - size_t num_points = tensor_points->dims[0]; + size_t num_points = tensor_points->getDimIndex(0); int64_t theory_ops = 0; int32_t cp_count = 31; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/yolo_box/yolo_box.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/yolo_box/yolo_box.cpp index f86e06dac..f62ce9814 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/yolo_box/yolo_box.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/yolo_box/yolo_box.cpp @@ -164,9 +164,9 @@ void YoloBoxExecutor::cpuCompute() { memset(boxes_data, 0, boxes_size); memset(scores_data, 0, scores_size); - const int n = x_desc->dims[0]; - const int h = x_desc->dims[2]; - const int w = x_desc->dims[3]; + const int n = x_desc->getDimIndex(0); + const int h = x_desc->getDimIndex(2); + const int w = x_desc->getDimIndex(3); auto anchors_desc = tensor_desc_[2].tensor; uint64_t anchors_tensor_num = mluOpGetTensorElementNum(anchors_desc); const int an_num = anchors_tensor_num / 2;