diff --git a/src/components/tl/ucp/allgather/allgather_knomial.c b/src/components/tl/ucp/allgather/allgather_knomial.c index 69180bbf78..2256677d12 100644 --- a/src/components/tl/ucp/allgather/allgather_knomial.c +++ b/src/components/tl/ucp/allgather/allgather_knomial.c @@ -260,8 +260,10 @@ ucc_status_t register_memory(ucc_coll_task_t *coll_task){ size_t dt_size = ucc_dt_size(args->dst.info.datatype); size_t data_size = count * dt_size; ucc_rank_t size = task->subset.map.ep_num; + ucc_info("size : %d", size); ucc_rank_t broot = args->coll_type == UCC_COLL_TYPE_BCAST ? args->root : 0; + ucc_info("coll_type : %d", args->coll_type); ucc_rank_t rank = VRANK(task->subset.myrank, broot, size); size_t local = GET_LOCAL_COUNT(args, size, rank); void *sbuf; @@ -302,23 +304,25 @@ ucc_status_t register_memory(ucc_coll_task_t *coll_task){ mmap_params.memory_type = ucc_memtype_to_ucs[mem_type]; if (KN_NODE_EXTRA == node_type) { if (p->type != KN_PATTERN_ALLGATHERX) { + ucc_info("1 : extra node"); mmap_params.address = task->allgather_kn.sbuf; mmap_params.length = local * dt_size; - MEM_MAP(); + MEM_MAP("1"); } mmap_params.address = rbuf; mmap_params.length = data_size; - MEM_MAP(); + MEM_MAP("2"); } if ((p->type != KN_PATTERN_ALLGATHERX) && (node_type == KN_NODE_PROXY)) { + ucc_info("2 : inside proxy"); peer = ucc_knomial_pattern_get_extra(p, rank); extra_count = GET_LOCAL_COUNT(args, size, peer); peer = ucc_ep_map_eval(task->subset.map, peer); mmap_params.address = PTR_OFFSET(task->allgather_kn.sbuf, local * dt_size); mmap_params.length = extra_count * dt_size; - MEM_MAP(); + MEM_MAP("3"); } if (KN_NODE_EXTRA == node_type) { @@ -341,7 +345,7 @@ ucc_status_t register_memory(ucc_coll_task_t *coll_task){ } mmap_params.address = sbuf; mmap_params.length = local_seg_count * dt_size; - MEM_MAP(); + MEM_MAP("4"); } for (loop_step = 1; loop_step < radix; loop_step++) { @@ -360,15 +364,16 @@ ucc_status_t register_memory(ucc_coll_task_t *coll_task){ } mmap_params.address = PTR_OFFSET(rbuf, peer_seg_offset * dt_size); mmap_params.length = peer_seg_count * dt_size; - MEM_MAP(); + MEM_MAP("5"); } ucc_kn_ag_pattern_next_iter(p); } if (KN_NODE_PROXY == node_type) { + ucc_info("3 : proxy"); mmap_params.address = args->dst.info.buffer; mmap_params.length = data_size; - MEM_MAP(); + MEM_MAP("6"); } out: @@ -394,6 +399,7 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_finalize(ucc_coll_task_t *coll_task){ if (status < 0){ tl_error(UCC_TASK_LIB(task), "failed to initialize ucc_mpool"); + return status; } return UCC_OK; @@ -416,6 +422,7 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_init_r( if (status < 0){ tl_error(UCC_TASK_LIB(task), "failed to initialize ucc_mpool"); + return status; } if (tl_team->cfg.use_reordering && @@ -434,6 +441,7 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_init_r( if (status < 0){ tl_error(UCC_TASK_LIB(task), "failed to register memory"); + return status; } *task_h = &task->super; return UCC_OK; diff --git a/src/components/tl/ucp/tl_ucp_coll.h b/src/components/tl/ucp/tl_ucp_coll.h index 4347ab2874..015454582c 100644 --- a/src/components/tl/ucp/tl_ucp_coll.h +++ b/src/components/tl/ucp/tl_ucp_coll.h @@ -57,9 +57,10 @@ void ucc_tl_ucp_team_default_score_str_free( } \ } while(0) -#define MEM_MAP() do { \ +#define MEM_MAP(index) do { \ status = ucs_status_to_ucc_status(ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh_list[count_mh++])); \ if (UCC_OK != status) { \ + tl_error(UCC_TASK_LIB(task), "mem_map failed : %s, length : %ld, mmap_params.address : %p", index, mmap_params.length, mmap_params.address); \ return status; \ } \ if (count_mh == size_of_list){ \