diff --git a/LICENSE b/LICENSE index f710c15..4de2934 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Intel Corportation +Copyright (C) Intel Corportation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 44d7f6f..f9fdc4f 100644 --- a/README.md +++ b/README.md @@ -42,22 +42,25 @@ You may obtain a copy of the License at https://opensource.org/licenses/MIT 6. Code Annotation - based on [Instrumentation and Tracing Technology API (ITT API)](chapters/code_annotation/ITT.md) -## Profiling Tools -- for OpenCL(TM), DPC++ (with OpenCL(TM) backend) and OpenMP* (with OpenCL(TM) backend): +## Profiling & Debug Tools +- unified tools: + - [onetrace](tools/onetrace) - host and device tracing tool for OpenCL(TM) and Level Zero backends with support of DPC++ (both for CPU and GPU) and OpenMP* GPU offload; +- tools for OpenCL(TM), DPC++ (with OpenCL(TM) backend) and OpenMP* GPU offload (with OpenCL(TM) backend): - [cl_hot_functions](samples/cl_hot_functions) - provides a list of hottest OpenCL(TM) API calls by backend (CPU and GPU); - [cl_hot_kernels](samples/cl_hot_kernels) - provides a list of hottest OpenCL(TM) kernels by backend (CPU and GPU); - [cl_debug_info](samples/cl_debug_info) - prints source and assembly (GEN ISA) for kernels on GPU; - [cl_gpu_metrics](samples/cl_gpu_metrics) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle; -- for Level Zero, DPC++ (with Level Zero backend) and OpenMP* (with Level Zero backend): + - [cl_tracer](samples/cl_tracer) - "Swiss army knife" for OpenCL(TM) API call tracing and profiling; +- tools for Level Zero, DPC++ (with Level Zero backend) and OpenMP* GPU offload (with Level Zero backend): - [ze_hot_functions](samples/ze_hot_functions) - provides a list of hottest Level Zero API calls; - 
[ze_hot_kernels](samples/ze_hot_kernels) - provides a list of hottest Level Zero kernels; - [ze_debug_info](samples/ze_debug_info) - prints source and assembly (GEN ISA) for kernels on GPU; - [ze_metric_query](samples/ze_metric_query) - provides a list of hottest Level Zero GPU kernels along with percent of cycles it was active, stall and idle (metrics are collected in *query* mode); - [ze_metric_streamer](samples/ze_metric_query) - provides a list of hottest Level Zero GPU kernels along with percent of cycles it was active, stall and idle (metrics are collected in *streamer* mode); - - [ze_tracer](samples/ze_tracer) - "Swiss army knife" for Level Zero profiling (former ze_intercept); -- for OpenMP* (with any backend): + - [ze_tracer](samples/ze_tracer) - "Swiss army knife" for Level Zero API call tracing and profiling (former ze_intercept); +- tools for OpenMP*: - [omp_hot_regions](samples/omp_hot_regions) - provides a list of hottest parallel (for CPU) and target (for GPU) OpenMP* regions; -- unified tools for binary instrumentation (for any GPU runtime): +- tools for binary instrumentation: - [gpu_inst_count](samples/gpu_inst_count) - prints GPU kernel assembly (GEN ISA) annotated by instruction execution count; - [gpu_perfmon_read](samples/gpu_perfmon_read) - prints GPU kernel assembly (GEN ISA) annotated by specific HW metric, which is accumulated in EU PerfMon register; diff --git a/VERSION b/VERSION index 142464b..d33c3a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.0 \ No newline at end of file +0.12.0 \ No newline at end of file diff --git a/chapters/device_activity_tracing/OpenCL.md b/chapters/device_activity_tracing/OpenCL.md index 5189e25..b660d80 100644 --- a/chapters/device_activity_tracing/OpenCL.md +++ b/chapters/device_activity_tracing/OpenCL.md @@ -10,6 +10,8 @@ The same event could be used to get additional profiling information for the dev - time counter in nanoseconds when the command identified by event starts execution on the device 
(`CL_PROFILING_COMMAND_START`); - time counter in nanoseconds when the command identified by event has finished execution on the device (`CL_PROFILING_COMMAND_END`). +Intel(R) Xeon(R) Processor / Intel(R) Core(TM) Processor (CPU) Runtimes use `QueryPerformanceCounter` on Windows and `CLOCK_MONOTONIC` on Linux as time sources for the counters described above. Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver also uses `QueryPerformanceCounter` on Windows but `CLOCK_MONOTONIC_RAW` on Linux. + **Supported Runtimes**: - any OpenCL(TM) 1.0 and above @@ -74,4 +76,5 @@ void CL_CALLBACK EventNotify(cl_event event, ## Samples - [OpenCL(TM) GEMM](../../samples/cl_gemm) - [OpenCL(TM) Hot Kernels](../../samples/cl_hot_kernels) -- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics) \ No newline at end of file +- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics) +- [OpenCL(TM) Tracer](../../samples/cl_tracer) \ No newline at end of file diff --git a/chapters/metrics_collection/MetricsDiscoveryAPI.md b/chapters/metrics_collection/MetricsDiscoveryAPI.md index 18e2247..2352783 100644 --- a/chapters/metrics_collection/MetricsDiscoveryAPI.md +++ b/chapters/metrics_collection/MetricsDiscoveryAPI.md @@ -231,7 +231,7 @@ The problem is that metrics timestamp one can get with Intel(R) Metrics Discover In Intel(R) Metrics Discovery Application Programming Interface library there is a function `GetGpuCpuTimestamps` that allows to bind GPU metrics timestamp to some CPU timestamp (which is based on `CLOCK_MONOTONIC` on Linux and `QueryPerformanceCounter` on Windows). -So e.g. to convert GPU metrics timestamp (`gpuTimestamp`) to OpenCL timestamp (`cpu_timestamp`), which is based on `CLOCK_MONOTONIC_RAW` on Linux, one should perform the following steps: +So e.g. to convert GPU metrics timestamp (`gpuTimestamp`) to OpenCL GPU timestamp (`cpu_timestamp`), which is based on `CLOCK_MONOTONIC_RAW` on Linux, one should perform the following steps: 1. 
Get "time snap point" to correlate GPU and `CLOCK_MONOTONIC` time: ```cpp uint64_t cpu_snap_point = 0, gpu_snap_point = 0; diff --git a/chapters/runtime_api_tracing/OpenCL.md b/chapters/runtime_api_tracing/OpenCL.md index 7b3d28f..feed581 100644 --- a/chapters/runtime_api_tracing/OpenCL.md +++ b/chapters/runtime_api_tracing/OpenCL.md @@ -42,7 +42,7 @@ cl_int CL_API_CALL clGetTracingStateINTEL( **Supported Runtimes**: - [Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver](https://github.com/intel/compute-runtime) -- Intel(R) CPU Runtime for OpenCL(TM) Applications +- [Intel(R) Xeon(R) Processor / Intel(R) Core(TM) Processor (CPU) Runtimes](https://software.intel.com/en-us/articles/opencl-drivers#cpu-section) **Supported OS**: - Linux @@ -115,4 +115,5 @@ void Callback(cl_function_id fid, - [OpenCL(TM) Hot Functions](../../samples/cl_hot_functions) - [OpenCL(TM) Hot Kernels](../../samples/cl_hot_kernels) - [OpenCL(TM) Debug Info](../../samples/cl_debug_info) -- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics) \ No newline at end of file +- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics) +- [OpenCL(TM) Tracer](../../samples/cl_tracer) \ No newline at end of file diff --git a/samples/build_utils/CMakeLists.txt b/samples/build_utils/CMakeLists.txt index 2bcf563..ca0378b 100644 --- a/samples/build_utils/CMakeLists.txt +++ b/samples/build_utils/CMakeLists.txt @@ -1,3 +1,5 @@ +set(PTI_CMAKE_MACRO_DIR ${CMAKE_CURRENT_LIST_DIR} CACHE INTERNAL "") + macro(SetRequiredCMakeVersion) set(REQUIRED_CMAKE_VERSION 2.8) endmacro() @@ -27,38 +29,6 @@ macro(SetBuildType) endif() endmacro() -macro(CheckForIntelCompiler) - if(WIN32) - set(INTEL_COMPILER_NAME "icl.exe") - else() - set(INTEL_COMPILER_NAME "icpx") - endif() - get_filename_component(COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) - if(COMPILER_NAME STREQUAL ${INTEL_COMPILER_NAME}) - message(STATUS "Intel(R) C++ Compiler is used") - else() - message(FATAL_ERROR - "Intel(R) C++ Compiler is 
required. " - "Use \"CXX=${INTEL_COMPILER_NAME} cmake ..\" command to configure the sample.") - endif() -endmacro() - -macro(CheckForDPCCompiler) - if(WIN32) - set(INTEL_COMPILER_NAME "dpcpp.exe") - else() - set(INTEL_COMPILER_NAME "dpcpp") - endif() - get_filename_component(COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) - if(COMPILER_NAME STREQUAL ${INTEL_COMPILER_NAME}) - message(STATUS "Intel(R) oneAPI DPC++ Compiler is used") - else() - message(FATAL_ERROR - "Intel(R) oneAPI DPC++ Compiler is required. " - "Use \"CXX=${INTEL_COMPILER_NAME} cmake ..\" command to configure the sample.") - endif() -endmacro() - macro(FindOpenCLLibrary TARGET) if(DEFINED ENV{LD_LIBRARY_PATH}) string(REPLACE ":" ";" SEARCH_LIB_PATH $ENV{LD_LIBRARY_PATH}) @@ -102,7 +72,7 @@ macro(FindOpenCLHeaders TARGET) "${OPENCL_INC_PATH}/CL/cl_gl.h" "${OPENCL_INC_PATH}/CL/cl_version.h" "${OPENCL_INC_PATH}/CL/cl_platform.h" - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_cl_headers.py" "${OPENCL_INC_PATH}" "${CMAKE_BINARY_DIR}") + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_cl_headers.py" "${OPENCL_INC_PATH}" "${CMAKE_BINARY_DIR}") target_include_directories(${TARGET} PUBLIC "${OPENCL_INC_PATH}") @@ -122,7 +92,7 @@ macro(GetOpenCLTracingHeaders TARGET) ${OPENCL_TRACING_INC_PATH}/CL/tracing_types.h) add_custom_command(OUTPUT ${OPENCL_TRACING_INC_PATH}/CL/tracing_api.h ${OPENCL_TRACING_INC_PATH}/CL/tracing_types.h - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_cl_tracing_headers.py" ${OPENCL_TRACING_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_cl_tracing_headers.py" ${OPENCL_TRACING_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${OPENCL_TRACING_INC_PATH}") @@ -149,7 +119,7 @@ macro(GetITT TARGET) ${ITT_INC_PATH}/ITT/ittnotify_types.h ${ITT_INC_PATH}/ITT/ittnotify.h ${ITT_INC_PATH}/ITT/legacy/ittnotify.h - COMMAND "${PYTHON_EXECUTABLE}" 
"${PROJECT_SOURCE_DIR}/../build_utils/get_itt.py" ${ITT_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_itt.py" ${ITT_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${ITT_INC_PATH}") @@ -226,7 +196,7 @@ macro(GetIGAHeaders TARGET) ${IGA_INC_PATH}/IGA/iga_bxml_enums.hpp ${IGA_INC_PATH}/IGA/kv.h ${IGA_INC_PATH}/IGA/kv.hpp - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_iga_headers.py" ${IGA_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_iga_headers.py" ${IGA_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${IGA_INC_PATH}") @@ -243,7 +213,7 @@ macro(GetIGCHeaders TARGET) ${IGC_INC_PATH}/IGC/patch_list.h) add_custom_command(OUTPUT ${IGC_INC_PATH}/IGC/program_debug_data.h ${IGC_INC_PATH}/IGC/patch_list.h - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_igc_headers.py" ${IGC_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_igc_headers.py" ${IGC_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${IGC_INC_PATH}") @@ -324,7 +294,7 @@ macro(GetMDHeaders TARGET) ${MD_INC_PATH}/MD/metrics_discovery_internal_api.h) add_custom_command(OUTPUT ${MD_INC_PATH}/MD/metrics_discovery_api.h ${MD_INC_PATH}/MD/metrics_discovery_internal_api.h - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_md_headers.py" ${MD_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_md_headers.py" ${MD_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${MD_INC_PATH}") @@ -363,7 +333,7 @@ macro(FindGTPinLibrary TARGET) ${GTPIN_LIB_PATH}/GTPIN/libgtpin_core.so ${GTPIN_LIB_PATH}/GTPIN/libiga_wrapper.so ${GTPIN_LIB_PATH}/GTPIN/libstdc++.so.6 - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_gtpin_libs.py" ${GTPIN_LIB_PATH} 
${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_gtpin_libs.py" ${GTPIN_LIB_PATH} ${CMAKE_BINARY_DIR}) target_link_libraries(${TARGET} "${GTPIN_LIB_PATH}/GTPIN/libgtpin.so") @@ -420,7 +390,7 @@ macro(GetGTPinHeaders TARGET) ${GTPIN_INC_PATH}/GTPIN/ged/intel64/ged_enum_types.h ${GTPIN_INC_PATH}/GTPIN/ged/intel64/ged.h ${GTPIN_INC_PATH}/GTPIN/ged/intel64/ged_ins_field.h - COMMAND "${PYTHON_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/../build_utils/get_gtpin_headers.py" ${GTPIN_INC_PATH} ${CMAKE_BINARY_DIR}) + COMMAND "${PYTHON_EXECUTABLE}" "${PTI_CMAKE_MACRO_DIR}/get_gtpin_headers.py" ${GTPIN_INC_PATH} ${CMAKE_BINARY_DIR}) target_include_directories(${TARGET} PUBLIC "${GTPIN_INC_PATH}/GTPIN" @@ -486,7 +456,7 @@ macro(FindL0HeadersPath TARGET L0_GEN_SCRIPT) add_custom_target(ze_gen_headers ALL DEPENDS ${L0_GEN_INC_PATH}/tracing.gen) add_custom_command(OUTPUT ${L0_GEN_INC_PATH}/tracing.gen - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH="${PROJECT_SOURCE_DIR}/../utils" "${PYTHON_EXECUTABLE}" ${L0_GEN_SCRIPT} ${L0_GEN_INC_PATH} "${L0_INC_PATH}/level_zero") + COMMAND "${PYTHON_EXECUTABLE}" ${L0_GEN_SCRIPT} ${L0_GEN_INC_PATH} "${L0_INC_PATH}/level_zero") target_include_directories(${TARGET} PUBLIC "${L0_GEN_INC_PATH}") add_dependencies(${TARGET} diff --git a/samples/cl_debug_info/cl_debug_info_collector.h b/samples/cl_debug_info/cl_debug_info_collector.h index 13187fe..80382da 100644 --- a/samples/cl_debug_info/cl_debug_info_collector.h +++ b/samples/cl_debug_info/cl_debug_info_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -15,7 +15,7 @@ #include #include -#include "cl_tracer.h" +#include "cl_api_tracer.h" #include "cl_utils.h" #include "igc_binary_decoder.h" #include "gen_symbols_decoder.h" @@ -52,7 +52,7 @@ class 
ClDebugInfoCollector { ClDebugInfoCollector* collector = new ClDebugInfoCollector(device); PTI_ASSERT(collector != nullptr); - ClTracer* tracer = new ClTracer(device, Callback, collector); + ClApiTracer* tracer = new ClApiTracer(device, Callback, collector); if (tracer == nullptr || !tracer->IsValid()) { std::cerr << "[WARNING] Unable to create OpenCL tracer " << "for target device" << std::endl; @@ -204,7 +204,7 @@ class ClDebugInfoCollector { PTI_ASSERT(device_ != nullptr); } - void EnableTracing(ClTracer* tracer) { + void EnableTracing(ClApiTracer* tracer) { PTI_ASSERT(tracer != nullptr); tracer_ = tracer; @@ -523,7 +523,7 @@ class ClDebugInfoCollector { } private: // Data - ClTracer* tracer_ = nullptr; + ClApiTracer* tracer_ = nullptr; cl_device_id device_ = nullptr; std::mutex lock_; diff --git a/samples/cl_debug_info/tool.cc b/samples/cl_debug_info/tool.cc index fde05ff..36596e8 100644 --- a/samples/cl_debug_info/tool.cc +++ b/samples/cl_debug_info/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/cl_gemm/main.cc b/samples/cl_gemm/main.cc index 9e2e96e..d2770d6 100644 --- a/samples/cl_gemm/main.cc +++ b/samples/cl_gemm/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/cl_gemm_inst/main.cc b/samples/cl_gemm_inst/main.cc index 3f14026..2fa1b92 100644 --- a/samples/cl_gemm_inst/main.cc +++ b/samples/cl_gemm_inst/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // 
SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/cl_gemm_itt/ittnotify.cc b/samples/cl_gemm_itt/ittnotify.cc index 6afafa8..e89bd7e 100644 --- a/samples/cl_gemm_itt/ittnotify.cc +++ b/samples/cl_gemm_itt/ittnotify.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/cl_gemm_itt/main.cc b/samples/cl_gemm_itt/main.cc index 70db357..3a40b43 100644 --- a/samples/cl_gemm_itt/main.cc +++ b/samples/cl_gemm_itt/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/cl_gpu_metrics/CMakeLists.txt b/samples/cl_gpu_metrics/CMakeLists.txt index 36b1bb8..e27656d 100644 --- a/samples/cl_gpu_metrics/CMakeLists.txt +++ b/samples/cl_gpu_metrics/CMakeLists.txt @@ -8,7 +8,10 @@ SetBuildType() # Tool Library -add_library(clt_gpu_metrics SHARED "${PROJECT_SOURCE_DIR}/../loader/init.cc" tool.cc) +add_library(clt_gpu_metrics SHARED + "${PROJECT_SOURCE_DIR}/../utils/trace_guard.cc" + "${PROJECT_SOURCE_DIR}/../loader/init.cc" + tool.cc) target_include_directories(clt_gpu_metrics PRIVATE "${PROJECT_SOURCE_DIR}/../utils" PRIVATE "${PROJECT_SOURCE_DIR}/../cl_hot_kernels") diff --git a/samples/cl_gpu_metrics/cl_metric_collector.h b/samples/cl_gpu_metrics/cl_metric_collector.h index 87d2b63..bdfd497 100644 --- a/samples/cl_gpu_metrics/cl_metric_collector.h +++ b/samples/cl_gpu_metrics/cl_metric_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // 
SPDX-License-Identifier: MIT // ============================================================= @@ -14,7 +14,7 @@ #include #include -#include "cl_tracer.h" +#include "cl_api_tracer.h" #include "cl_utils.h" #include "metric_device.h" @@ -51,7 +51,7 @@ class ClMetricCollector { new ClMetricCollector(metric_device, group, set); PTI_ASSERT(collector != nullptr); - ClTracer* tracer = new ClTracer(device, Callback, collector); + ClApiTracer* tracer = new ClApiTracer(device, Callback, collector); if (tracer == nullptr || !tracer->IsValid()) { std::cerr << "[WARNING] Unable to create OpenCL tracer " << "for target device" << std::endl; @@ -182,7 +182,7 @@ class ClMetricCollector { PTI_ASSERT(set_ != nullptr); } - void EnableTracing(ClTracer* tracer) { + void EnableTracing(ClApiTracer* tracer) { PTI_ASSERT(tracer != nullptr); tracer_ = tracer; @@ -326,7 +326,7 @@ class ClMetricCollector { } private: // Data - ClTracer* tracer_ = nullptr; + ClApiTracer* tracer_ = nullptr; MetricDevice* device_ = nullptr; md::IConcurrentGroup_1_5* group_ = nullptr; diff --git a/samples/cl_gpu_metrics/tool.cc b/samples/cl_gpu_metrics/tool.cc index 8b91e73..884c994 100644 --- a/samples/cl_gpu_metrics/tool.cc +++ b/samples/cl_gpu_metrics/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -82,7 +82,7 @@ static KernelMap GetKernelMap() { return KernelMap(); } - const KernelIntervalList& kernel_interval_list = + const ClKernelIntervalList& kernel_interval_list = kernel_collector->GetKernelIntervalList(); if (kernel_interval_list.size() == 0) { return KernelMap(); diff --git a/samples/cl_hot_functions/CMakeLists.txt b/samples/cl_hot_functions/CMakeLists.txt index e1a7b40..bb2bf5d 100644 --- a/samples/cl_hot_functions/CMakeLists.txt +++ b/samples/cl_hot_functions/CMakeLists.txt @@ -8,7 
+8,10 @@ SetBuildType() # Tool Library -add_library(clt_hot_functions SHARED "${PROJECT_SOURCE_DIR}/../loader/init.cc" tool.cc) +add_library(clt_hot_functions SHARED + "${PROJECT_SOURCE_DIR}/../utils/trace_guard.cc" + "${PROJECT_SOURCE_DIR}/../loader/init.cc" + tool.cc) target_include_directories(clt_hot_functions PRIVATE "${PROJECT_SOURCE_DIR}/../utils") if(CMAKE_INCLUDE_PATH) diff --git a/samples/cl_hot_functions/cl_api_callbacks.h b/samples/cl_hot_functions/cl_api_callbacks.h new file mode 100644 index 0000000..ba3c497 --- /dev/null +++ b/samples/cl_hot_functions/cl_api_callbacks.h @@ -0,0 +1,5476 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef PTI_SAMPLES_CL_HOT_FUNCTIONS_CL_API_CALLBACKS_H_ +#define PTI_SAMPLES_CL_HOT_FUNCTIONS_CL_API_CALLBACKS_H_ + +#include + +static thread_local cl_int current_error = CL_SUCCESS; + +static const char* GetErrorString(cl_int error) { + switch (error) { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return 
"CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return 
"CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return "CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; + default: + break; + } + return "UNKNOWN"; +} + +static void clGetSupportedImageFormatsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetSupportedImageFormats* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " imageType = " << *(params->imageType); + stream << " numEntries = " << *(params->numEntries); + stream << " imageFormats = " << *(params->imageFormats); + stream << " numImageFormats = " << *(params->numImageFormats); + stream << std::endl; + + 
std::cerr << stream.str(); +} + +static void clGetSupportedImageFormatsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetKernelInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCompileProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCompileProgram* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << " 
numDevices = " << *(params->numDevices); + stream << " deviceList = " << *(params->deviceList); + if (*(params->options) == nullptr) { + stream << " options = " << "0"; + } else if (strlen(*(params->options)) == 0) { + stream << " options = \"\""; + } else { + stream << " options = \"" << *(params->options) << "\""; + } + stream << " numInputHeaders = " << *(params->numInputHeaders); + stream << " inputHeaders = " << *(params->inputHeaders); + stream << " headerIncludeNames = " << *(params->headerIncludeNames); + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCompileProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetEventCallbackOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetEventCallback* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << " commandExecCallbackType = " << + *(params->commandExecCallbackType); + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetEventCallbackOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = 
reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clUnloadPlatformCompilerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clUnloadPlatformCompiler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " platform = " << *(params->platform); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clUnloadPlatformCompilerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPlatformIDsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetPlatformIDs* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " numEntries = " << *(params->numEntries); + stream << " platforms = " << *(params->platforms); + stream << " numPlatforms = " << *(params->numPlatforms); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPlatformIDsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " 
<< GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clUnloadCompilerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clUnloadCompiler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clUnloadCompilerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueBarrierWithWaitListOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueBarrierWithWaitList* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueBarrierWithWaitListOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + 
stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMapBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueMapBuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + stream << " blockingMap = " << *(params->blockingMap); + stream << " mapFlags = " << *(params->mapFlags); + stream << " offset = " << *(params->offset); + stream << " cb = " << *(params->cb); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clEnqueueMapBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clEnqueueMapBuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + void ** result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateImage3DOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateImage3D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + 
stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " imageFormat = " << *(params->imageFormat); + stream << " imageWidth = " << *(params->imageWidth); + stream << " imageHeight = " << *(params->imageHeight); + stream << " imageDepth = " << *(params->imageDepth); + stream << " imageRowPitch = " << *(params->imageRowPitch); + stream << " imageSlicePitch = " << *(params->imageSlicePitch); + stream << " hostPtr = " << *(params->hostPtr); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateImage3DOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateImage3D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelArgInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetKernelArgInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " argIndx = " << *(params->argIndx); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream 
<< " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelArgInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMFreeOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMFree* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numSvmPointers = " << *(params->numSvmPointers); + stream << " svmPointers = " << *(params->svmPointers); + stream << " pfnFreeFunc = " << *(params->pfnFreeFunc); + stream << " userData = " << *(params->userData); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMFreeOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyImageToBufferOnEnter( + 
cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueCopyImageToBuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " srcImage = " << *(params->srcImage); + stream << " dstBuffer = " << *(params->dstBuffer); + stream << " srcOrigin = " << *(params->srcOrigin); + stream << " region = " << *(params->region); + stream << " dstOffset = " << *(params->dstOffset); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyImageToBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetContextInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetContextInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetContextInfoOnExit( + 
cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainCommandQueueOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainCommandQueue* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainCommandQueueOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueWriteImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " image = " << *(params->image); + stream << " blockingWrite = " << *(params->blockingWrite); + stream << " origin = " << *(params->origin); + stream << " region = " << *(params->region); + stream << " inputRowPitch = " << *(params->inputRowPitch); + stream << " inputSlicePitch = " << 
*(params->inputSlicePitch); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWaitForEventsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueWaitForEvents* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numEvents = " << *(params->numEvents); + stream << " eventList = " << *(params->eventList); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWaitForEventsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMUnmapOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMUnmap* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + 
std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " svmPtr = " << *(params->svmPtr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMUnmapOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateProgramWithBinaryOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateProgramWithBinary* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " numDevices = " << *(params->numDevices); + stream << " deviceList = " << *(params->deviceList); + stream << " lengths = " << *(params->lengths); + stream << " binaries = " << *(params->binaries); + stream << " binaryStatus = " << *(params->binaryStatus); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateProgramWithBinaryOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const 
cl_params_clCreateProgramWithBinary* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_program* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueFillImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueFillImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " image = " << *(params->image); + stream << " fillColor = " << *(params->fillColor); + stream << " origin = " << *(params->origin); + stream << " region = " << *(params->region); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueFillImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateFromGLTexture2DOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateFromGLTexture2D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream 
stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " target = " << *(params->target); + stream << " miplevel = " << *(params->miplevel); + stream << " texture = " << *(params->texture); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateFromGLTexture2DOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateFromGLTexture2D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelExecInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetKernelExecInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelExecInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << 
data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReleaseGLObjectsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueReleaseGLObjects* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numObjects = " << *(params->numObjects); + stream << " memObjects = " << *(params->memObjects); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReleaseGLObjectsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceIDsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetDeviceIDs* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " platform = " << *(params->platform); + stream << " deviceType = " << *(params->deviceType); + stream << " numEntries = " << *(params->numEntries); + stream << " devices = " << 
*(params->devices); + stream << " numDevices = " << *(params->numDevices); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceIDsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseMemObjectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseMemObject* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseMemObjectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetGLObjectInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetGLObjectInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << " glObjectType = " << *(params->glObjectType); + stream << " glObjectName = " << *(params->glObjectName); + stream << std::endl; + + std::cerr << 
stream.str(); +} + +static void clGetGLObjectInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateFromGLRenderbufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateFromGLRenderbuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " renderbuffer = " << *(params->renderbuffer); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateFromGLRenderbufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateFromGLRenderbuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseContextOnEnter( + cl_callback_data* data, uint64_t start) { + const 
cl_params_clReleaseContext* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseContextOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueUnmapMemObjectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueUnmapMemObject* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " memobj = " << *(params->memobj); + stream << " mappedPtr = " << *(params->mappedPtr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueUnmapMemObjectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void 
clCreateContextOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateContext* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " properties = " << *(params->properties); + stream << " numDevices = " << *(params->numDevices); + stream << " devices = " << *(params->devices); + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateContextOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateContext* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_context* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetHostTimerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetHostTimer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " device = " << *(params->device); + stream << " hostTimestamp = " << *(params->hostTimestamp); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetHostTimerOnExit( + cl_callback_data* data, 
uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPipeInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetPipeInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " pipe = " << *(params->pipe); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPipeInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueAcquireGLObjectsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueAcquireGLObjects* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numObjects = " << *(params->numObjects); + stream << " memObjects = " << *(params->memObjects); 
+ stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueAcquireGLObjectsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelWorkGroupInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetKernelWorkGroupInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " device = " << *(params->device); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelWorkGroupInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateImage2DOnEnter( + cl_callback_data* data, uint64_t start) { + const 
cl_params_clCreateImage2D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " imageFormat = " << *(params->imageFormat); + stream << " imageWidth = " << *(params->imageWidth); + stream << " imageHeight = " << *(params->imageHeight); + stream << " imageRowPitch = " << *(params->imageRowPitch); + stream << " hostPtr = " << *(params->hostPtr); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateImage2DOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateImage2D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateContextFromTypeOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateContextFromType* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " properties = " << *(params->properties); + stream << " deviceType = " << *(params->deviceType); + stream << " funcNotify = " << *(params->funcNotify); + stream << " 
userData = " << *(params->userData); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateContextFromTypeOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateContextFromType* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_context* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainProgram* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateProgramWithSourceOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateProgramWithSource* params = + 
reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " count = " << *(params->count); + stream << " strings = " << *(params->strings); + stream << " lengths = " << *(params->lengths); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateProgramWithSourceOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateProgramWithSource* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_program* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetMemObjectInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetMemObjectInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetMemObjectInfoOnExit( + 
cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clLinkProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clLinkProgram* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " numDevices = " << *(params->numDevices); + stream << " deviceList = " << *(params->deviceList); + if (*(params->options) == nullptr) { + stream << " options = " << "0"; + } else if (strlen(*(params->options)) == 0) { + stream << " options = \"\""; + } else { + stream << " options = \"" << *(params->options) << "\""; + } + stream << " numInputPrograms = " << *(params->numInputPrograms); + stream << " inputPrograms = " << *(params->inputPrograms); + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clLinkProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clLinkProgram* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_program* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + 
stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateSamplerWithPropertiesOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateSamplerWithProperties* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " samplerProperties = " << *(params->samplerProperties); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateSamplerWithPropertiesOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateSamplerWithProperties* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_sampler* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainSamplerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainSampler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " sampler = " << *(params->sampler); + 
stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainSamplerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateFromGLTexture3DOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateFromGLTexture3D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " target = " << *(params->target); + stream << " miplevel = " << *(params->miplevel); + stream << " texture = " << *(params->texture); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateFromGLTexture3DOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateFromGLTexture3D* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << 
stream.str(); +} + +static void clEnqueueMapImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueMapImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " image = " << *(params->image); + stream << " blockingMap = " << *(params->blockingMap); + stream << " mapFlags = " << *(params->mapFlags); + stream << " origin = " << *(params->origin); + stream << " region = " << *(params->region); + stream << " imageRowPitch = " << *(params->imageRowPitch); + stream << " imageSlicePitch = " << *(params->imageSlicePitch); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clEnqueueMapImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clEnqueueMapImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + void ** result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueWriteBuffer* params = + 
reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + stream << " blockingWrite = " << *(params->blockingWrite); + stream << " offset = " << *(params->offset); + stream << " cb = " << *(params->cb); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueCopyImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " srcImage = " << *(params->srcImage); + stream << " dstImage = " << *(params->dstImage); + stream << " srcOrigin = " << *(params->srcOrigin); + stream << " dstOrigin = " << *(params->dstOrigin); + stream << " region = " << *(params->region); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << 
std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetExtensionFunctionAddressOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetExtensionFunctionAddress* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + if (*(params->funcName) == nullptr) { + stream << " funcName = " << "0"; + } else if (strlen(*(params->funcName)) == 0) { + stream << " funcName = \"\""; + } else { + stream << " funcName = \"" << *(params->funcName) << "\""; + } + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetExtensionFunctionAddressOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + void** result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadBufferRectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueReadBufferRect* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + 
stream << " blockingRead = " << *(params->blockingRead); + stream << " bufferOrigin = " << *(params->bufferOrigin); + stream << " hostOrigin = " << *(params->hostOrigin); + stream << " region = " << *(params->region); + stream << " bufferRowPitch = " << *(params->bufferRowPitch); + stream << " bufferSlicePitch = " << *(params->bufferSlicePitch); + stream << " hostRowPitch = " << *(params->hostRowPitch); + stream << " hostSlicePitch = " << *(params->hostSlicePitch); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadBufferRectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateSubDevicesOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateSubDevices* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " inDevice = " << *(params->inDevice); + stream << " properties = " << *(params->properties); + stream << " numDevices = " << *(params->numDevices); + stream << " outDevices = " << *(params->outDevices); + stream << " numDevicesRet = " << *(params->numDevicesRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateSubDevicesOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + 
stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceAndHostTimerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetDeviceAndHostTimer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " device = " << *(params->device); + stream << " deviceTimestamp = " << *(params->deviceTimestamp); + stream << " hostTimestamp = " << *(params->hostTimestamp); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceAndHostTimerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseSamplerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseSampler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " sampler = " << *(params->sampler); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseSamplerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " 
ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueTaskOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueTask* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " kernel = " << *(params->kernel); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueTaskOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clFinishOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clFinish* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clFinishOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = 
reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetEventInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetEventInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetEventInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetEventProfilingInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetEventProfilingInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static 
void clGetEventProfilingInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelArgSVMPointerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetKernelArgSVMPointer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " argIndex = " << *(params->argIndex); + stream << " argValue = " << *(params->argValue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelArgSVMPointerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " imageFormat = " << *(params->imageFormat); + stream << " imageDesc = " << *(params->imageDesc); + stream << " hostPtr = " << 
*(params->hostPtr); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateImage* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMemcpyOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMMemcpy* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " blockingCopy = " << *(params->blockingCopy); + stream << " dstPtr = " << *(params->dstPtr); + stream << " srcPtr = " << *(params->srcPtr); + stream << " size = " << *(params->size); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMemcpyOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = 
reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueNativeKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueNativeKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " userFunc = " << *(params->userFunc); + stream << " args = " << *(params->args); + stream << " cbArgs = " << *(params->cbArgs); + stream << " numMemObjects = " << *(params->numMemObjects); + stream << " memList = " << *(params->memList); + stream << " argsMemLoc = " << *(params->argsMemLoc); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << 
stream.str(); +} + +static void clEnqueueNativeKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateKernelsInProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateKernelsInProgram* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << " numKernels = " << *(params->numKernels); + stream << " kernels = " << *(params->kernels); + stream << " numKernelsRet = " << *(params->numKernelsRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateKernelsInProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetCommandQueuePropertyOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetCommandQueueProperty* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " properties = " << *(params->properties); + 
stream << " enable = " << *(params->enable); + stream << " oldProperties = " << *(params->oldProperties); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetCommandQueuePropertyOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetDeviceInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " device = " << *(params->device); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetDeviceInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueNDRangeKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueNDRangeKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream 
stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " kernel = " << *(params->kernel); + stream << " workDim = " << *(params->workDim); + stream << " globalWorkOffset = " << *(params->globalWorkOffset); + stream << " globalWorkSize = " << *(params->globalWorkSize); + stream << " localWorkSize = " << *(params->localWorkSize); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueNDRangeKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseProgram* params = + reinterpret_cast<const cl_params_clReleaseProgram*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr <<
stream.str(); +} + +static void clCreateFromGLBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateFromGLBuffer* params = + reinterpret_cast<const cl_params_clCreateFromGLBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " bufobj = " << *(params->bufobj); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateFromGLBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateFromGLBuffer* params = + reinterpret_cast<const cl_params_clCreateFromGLBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast<cl_mem*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetGLTextureInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetGLTextureInfo* params = + reinterpret_cast<const cl_params_clGetGLTextureInfo*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream <<
std::endl; + + std::cerr << stream.str(); +} + +static void clGetGLTextureInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetDefaultDeviceCommandQueueOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetDefaultDeviceCommandQueue* params = + reinterpret_cast<const cl_params_clSetDefaultDeviceCommandQueue*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " device = " << *(params->device); + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetDefaultDeviceCommandQueueOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreatePipeOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreatePipe* params = + reinterpret_cast<const cl_params_clCreatePipe*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " pipePacketSize = " << *(params->pipePacketSize); +
stream << " pipeMaxPackets = " << *(params->pipeMaxPackets); + stream << " properties = " << *(params->properties); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreatePipeOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreatePipe* params = + reinterpret_cast<const cl_params_clCreatePipe*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast<cl_mem*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPlatformInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetPlatformInfo* params = + reinterpret_cast<const cl_params_clGetPlatformInfo*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " platform = " << *(params->platform); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetPlatformInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr);
+ + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueReadBuffer* params = + reinterpret_cast<const cl_params_clEnqueueReadBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + stream << " blockingRead = " << *(params->blockingRead); + stream << " offset = " << *(params->offset); + stream << " cb = " << *(params->cb); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetMemObjectDestructorCallbackOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetMemObjectDestructorCallback* params = + reinterpret_cast<const cl_params_clSetMemObjectDestructorCallback*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << std::endl; + + std::cerr << stream.str(); +}
+ +static void clSetMemObjectDestructorCallbackOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelSubGroupInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetKernelSubGroupInfo* params = + reinterpret_cast<const cl_params_clGetKernelSubGroupInfo*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " device = " << *(params->device); + stream << " paramName = " << *(params->paramName); + stream << " inputValueSize = " << *(params->inputValueSize); + stream << " inputValue = " << *(params->inputValue); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetKernelSubGroupInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferRectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueCopyBufferRect* params = + reinterpret_cast<const cl_params_clEnqueueCopyBufferRect*>( + data->functionParams); + PTI_ASSERT(params !=
nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " srcBuffer = " << *(params->srcBuffer); + stream << " dstBuffer = " << *(params->dstBuffer); + stream << " srcOrigin = " << *(params->srcOrigin); + stream << " dstOrigin = " << *(params->dstOrigin); + stream << " region = " << *(params->region); + stream << " srcRowPitch = " << *(params->srcRowPitch); + stream << " srcSlicePitch = " << *(params->srcSlicePitch); + stream << " dstRowPitch = " << *(params->dstRowPitch); + stream << " dstSlicePitch = " << *(params->dstSlicePitch); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferRectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clWaitForEventsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clWaitForEvents* params = + reinterpret_cast<const cl_params_clWaitForEvents*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " numEvents = " << *(params->numEvents); + stream << " eventList = " << *(params->eventList); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clWaitForEventsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "]
" << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMigrateMemOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMMigrateMem* params = + reinterpret_cast<const cl_params_clEnqueueSVMMigrateMem*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numSvmPointers = " << *(params->numSvmPointers); + stream << " svmPointers = " << *(params->svmPointers); + stream << " sizes = " << *(params->sizes); + stream << " flags = " << *(params->flags); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMigrateMemOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainKernel* params = + reinterpret_cast<const cl_params_clRetainKernel*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << std::endl; + + std::cerr <<
stream.str(); +} + +static void clRetainKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateCommandQueueWithPropertiesOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateCommandQueueWithProperties* params = + reinterpret_cast<const cl_params_clCreateCommandQueueWithProperties*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " device = " << *(params->device); + stream << " properties = " << *(params->properties); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateCommandQueueWithPropertiesOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateCommandQueueWithProperties* params = + reinterpret_cast<const cl_params_clCreateCommandQueueWithProperties*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_command_queue* result = + reinterpret_cast<cl_command_queue*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateProgramWithBuiltInKernelsOnEnter( +
cl_callback_data* data, uint64_t start) { + const cl_params_clCreateProgramWithBuiltInKernels* params = + reinterpret_cast<const cl_params_clCreateProgramWithBuiltInKernels*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " numDevices = " << *(params->numDevices); + stream << " deviceList = " << *(params->deviceList); + if (*(params->kernelNames) == nullptr) { + stream << " kernelNames = " << "0"; + } else if (strlen(*(params->kernelNames)) == 0) { + stream << " kernelNames = \"\""; + } else { + stream << " kernelNames = \"" << *(params->kernelNames) << "\""; + } + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateProgramWithBuiltInKernelsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateProgramWithBuiltInKernels* params = + reinterpret_cast<const cl_params_clCreateProgramWithBuiltInKernels*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_program* result = + reinterpret_cast<cl_program*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateBuffer* params = + reinterpret_cast<const cl_params_clCreateBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream
<< " flags = " << *(params->flags); + stream << " size = " << *(params->size); + stream << " hostPtr = " << *(params->hostPtr); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateBuffer* params = + reinterpret_cast<const cl_params_clCreateBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast<cl_mem*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetProgramBuildInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetProgramBuildInfo* params = + reinterpret_cast<const cl_params_clGetProgramBuildInfo*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << " device = " << *(params->device); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetProgramBuildInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error =
reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueFillBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueFillBuffer* params = + reinterpret_cast<const cl_params_clEnqueueFillBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + stream << " pattern = " << *(params->pattern); + stream << " patternSize = " << *(params->patternSize); + stream << " offset = " << *(params->offset); + stream << " size = " << *(params->size); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueFillBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueReadImage* params = + reinterpret_cast<const cl_params_clEnqueueReadImage*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " image = " << *(params->image); + stream << " blockingRead = " <<
*(params->blockingRead); + stream << " origin = " << *(params->origin); + stream << " region = " << *(params->region); + stream << " rowPitch = " << *(params->rowPitch); + stream << " slicePitch = " << *(params->slicePitch); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueReadImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteBufferRectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueWriteBufferRect* params = + reinterpret_cast<const cl_params_clEnqueueWriteBufferRect*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " buffer = " << *(params->buffer); + stream << " blockingWrite = " << *(params->blockingWrite); + stream << " bufferOrigin = " << *(params->bufferOrigin); + stream << " hostOrigin = " << *(params->hostOrigin); + stream << " region = " << *(params->region); + stream << " bufferRowPitch = " << *(params->bufferRowPitch); + stream << " bufferSlicePitch = " << *(params->bufferSlicePitch); + stream << " hostRowPitch = " << *(params->hostRowPitch); + stream << " hostSlicePitch = " << *(params->hostSlicePitch); + stream << " ptr = " << *(params->ptr); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream <<
" eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueWriteBufferRectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferToImageOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueCopyBufferToImage* params = + reinterpret_cast<const cl_params_clEnqueueCopyBufferToImage*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " srcBuffer = " << *(params->srcBuffer); + stream << " dstImage = " << *(params->dstImage); + stream << " srcOffset = " << *(params->srcOffset); + stream << " dstOrigin = " << *(params->dstOrigin); + stream << " region = " << *(params->region); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferToImageOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void
clGetExtensionFunctionAddressForPlatformOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetExtensionFunctionAddressForPlatform* params = + reinterpret_cast< + const cl_params_clGetExtensionFunctionAddressForPlatform*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " platform = " << *(params->platform); + if (*(params->funcName) == nullptr) { + stream << " funcName = " << "0"; + } else if (strlen(*(params->funcName)) == 0) { + stream << " funcName = \"\""; + } else { + stream << " funcName = \"" << *(params->funcName) << "\""; + } + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetExtensionFunctionAddressForPlatformOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + void** result = + reinterpret_cast<void**>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelArgOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetKernelArg* params = + reinterpret_cast<const cl_params_clSetKernelArg*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " kernel = " << *(params->kernel); + stream << " argIndex = " << *(params->argIndex); + stream << " argSize = " << *(params->argSize); + stream << " argValue = " << *(params->argValue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetKernelArgOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error =
reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseDeviceOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseDevice* params = + reinterpret_cast<const cl_params_clReleaseDevice*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " device = " << *(params->device); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseDeviceOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateSubBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateSubBuffer* params = + reinterpret_cast<const cl_params_clCreateSubBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " buffer = " << *(params->buffer); + stream << " flags = " << *(params->flags); + stream << " bufferCreateType = " << *(params->bufferCreateType); + stream << " bufferCreateInfo = " << *(params->bufferCreateInfo); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = &current_error; + } +} + +static void clCreateSubBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " <<
data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateSubBuffer* params = + reinterpret_cast<const cl_params_clCreateSubBuffer*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast<cl_mem*>(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMigrateMemObjectsOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueMigrateMemObjects* params = + reinterpret_cast<const cl_params_clEnqueueMigrateMemObjects*>( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numMemObjects = " << *(params->numMemObjects); + stream << " memObjects = " << *(params->memObjects); + stream << " flags = " << *(params->flags); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMigrateMemObjectsOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast<cl_int*>(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateCommandQueueOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateCommandQueue* params = + reinterpret_cast<const cl_params_clCreateCommandQueue*>( + data->functionParams); +
PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " device = " << *(params->device); + stream << " properties = " << *(params->properties); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateCommandQueueOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateCommandQueue* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_command_queue* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMemFillOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMMemFill* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " svmPtr = " << *(params->svmPtr); + stream << " pattern = " << *(params->pattern); + stream << " patternSize = " << *(params->patternSize); + stream << " size = " << *(params->size); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << 
stream.str(); +} + +static void clEnqueueSVMMemFillOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseCommandQueueOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseCommandQueue* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseCommandQueueOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueCopyBuffer* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " srcBuffer = " << *(params->srcBuffer); + stream << " dstBuffer = " << *(params->dstBuffer); + stream << " srcOffset = " << *(params->srcOffset); + stream << " dstOffset = " << *(params->dstOffset); + stream << " cb = " << 
*(params->cb); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueCopyBufferOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetCommandQueueInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetCommandQueueInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetCommandQueueInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clBuildProgramOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clBuildProgram* params = + 
reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << " numDevices = " << *(params->numDevices); + stream << " deviceList = " << *(params->deviceList); + if (*(params->options) == nullptr) { + stream << " options = " << "0"; + } else if (strlen(*(params->options)) == 0) { + stream << " options = \"\""; + } else { + stream << " options = \"" << *(params->options) << "\""; + } + stream << " funcNotify = " << *(params->funcNotify); + stream << " userData = " << *(params->userData); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clBuildProgramOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainContextOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainContext* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainContextOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error 
<< ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueBarrierOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueBarrier* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueBarrierOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainDeviceOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainDevice* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " device = " << *(params->device); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainDeviceOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMapOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueSVMMap* params = + reinterpret_cast( + data->functionParams); + 
PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " blockingMap = " << *(params->blockingMap); + stream << " mapFlags = " << *(params->mapFlags); + stream << " svmPtr = " << *(params->svmPtr); + stream << " size = " << *(params->size); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueSVMMapOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainMemObjectOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainMemObject* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " memobj = " << *(params->memobj); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainMemObjectOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void 
clSetUserEventStatusOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSetUserEventStatus* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << " executionStatus = " << *(params->executionStatus); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSetUserEventStatusOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateUserEventOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateUserEvent* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateUserEventOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateUserEvent* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_event* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + 
PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetSamplerInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetSamplerInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " sampler = " << *(params->sampler); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetSamplerInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMarkerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueMarker* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMarkerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " 
[" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + if (*(params->kernelName) == nullptr) { + stream << " kernelName = " << "0"; + } else if (strlen(*(params->kernelName)) == 0) { + stream << " kernelName = \"\""; + } else { + stream << " kernelName = \"" << *(params->kernelName) << "\""; + } + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_kernel* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetProgramInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetProgramInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream 
stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " program = " << *(params->program); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetProgramInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSVMAllocOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSVMAlloc* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " size = " << *(params->size); + stream << " alignment = " << *(params->alignment); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSVMAllocOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + void ** result = + reinterpret_cast( + data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainEventOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clRetainEvent* params = + reinterpret_cast( + 
data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clRetainEventOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCloneKernelOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCloneKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " sourceKernel = " << *(params->sourceKernel); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCloneKernelOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCloneKernel* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_kernel* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static 
void clGetImageInfoOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clGetImageInfo* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " image = " << *(params->image); + stream << " paramName = " << *(params->paramName); + stream << " paramValueSize = " << *(params->paramValueSize); + stream << " paramValue = " << *(params->paramValue); + stream << " paramValueSizeRet = " << *(params->paramValueSizeRet); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clGetImageInfoOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clFlushOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clFlush* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clFlushOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void 
clEnqueueMarkerWithWaitListOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clEnqueueMarkerWithWaitList* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " commandQueue = " << *(params->commandQueue); + stream << " numEventsInWaitList = " << *(params->numEventsInWaitList); + stream << " eventWaitList = " << *(params->eventWaitList); + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clEnqueueMarkerWithWaitListOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateProgramWithILOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateProgramWithIL* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " il = " << *(params->il); + stream << " length = " << *(params->length); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateProgramWithILOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const 
cl_params_clCreateProgramWithIL* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_program* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateSamplerOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clCreateSampler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " normalizedCoords = " << *(params->normalizedCoords); + stream << " addressingMode = " << *(params->addressingMode); + stream << " filterMode = " << *(params->filterMode); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateSamplerOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateSampler* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_sampler* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clCreateFromGLTextureOnEnter( + cl_callback_data* 
data, uint64_t start) { + const cl_params_clCreateFromGLTexture* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " flags = " << *(params->flags); + stream << " target = " << *(params->target); + stream << " miplevel = " << *(params->miplevel); + stream << " texture = " << *(params->texture); + stream << " errcodeRet = " << *(params->errcodeRet); + stream << std::endl; + + std::cerr << stream.str(); + + if (*(params->errcodeRet) == nullptr) { + *(params->errcodeRet) = ¤t_error; + } +} + +static void clCreateFromGLTextureOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + const cl_params_clCreateFromGLTexture* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + cl_mem* result = + reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(result != nullptr); + stream << " result = " << *result; + + PTI_ASSERT(*(params->errcodeRet) != nullptr); + stream << " -> " << GetErrorString(**(params->errcodeRet)); + stream << " (" << **(params->errcodeRet) << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSVMFreeOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clSVMFree* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " context = " << *(params->context); + stream << " svmPointer = " << *(params->svmPointer); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clSVMFreeOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << 
"<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseEventOnEnter( + cl_callback_data* data, uint64_t start) { + const cl_params_clReleaseEvent* params = + reinterpret_cast( + data->functionParams); + PTI_ASSERT(params != nullptr); + + std::stringstream stream; + stream << ">>>> [" << start << "] " << data->functionName << ":"; + + stream << " event = " << *(params->event); + stream << std::endl; + + std::cerr << stream.str(); +} + +static void clReleaseEventOnExit( + cl_callback_data* data, uint64_t start, uint64_t end) { + std::stringstream stream; + stream << "<<<< [" << end << "] " << data->functionName; + stream << " [" << (end - start) << " ns]"; + + cl_int* error = reinterpret_cast(data->functionReturnValue); + PTI_ASSERT(error != nullptr); + + stream << " -> " << GetErrorString(*error); + stream << " (" << *error << ")"; + stream << std::endl; + + std::cerr << stream.str(); +} + +static void OnEnterFunction( + cl_function_id function, cl_callback_data* data, uint64_t start) { + switch (function) { + case CL_FUNCTION_clBuildProgram: + clBuildProgramOnEnter(data, start); + break; + case CL_FUNCTION_clCloneKernel: + clCloneKernelOnEnter(data, start); + break; + case CL_FUNCTION_clCompileProgram: + clCompileProgramOnEnter(data, start); + break; + case CL_FUNCTION_clCreateBuffer: + clCreateBufferOnEnter(data, start); + break; + case CL_FUNCTION_clCreateCommandQueue: + clCreateCommandQueueOnEnter(data, start); + break; + case CL_FUNCTION_clCreateCommandQueueWithProperties: + clCreateCommandQueueWithPropertiesOnEnter(data, start); + break; + case CL_FUNCTION_clCreateContext: + clCreateContextOnEnter(data, start); + break; + case CL_FUNCTION_clCreateContextFromType: + clCreateContextFromTypeOnEnter(data, start); + break; + case CL_FUNCTION_clCreateFromGLBuffer: + clCreateFromGLBufferOnEnter(data, start); + break; + case 
CL_FUNCTION_clCreateFromGLRenderbuffer: + clCreateFromGLRenderbufferOnEnter(data, start); + break; + case CL_FUNCTION_clCreateFromGLTexture: + clCreateFromGLTextureOnEnter(data, start); + break; + case CL_FUNCTION_clCreateFromGLTexture2D: + clCreateFromGLTexture2DOnEnter(data, start); + break; + case CL_FUNCTION_clCreateFromGLTexture3D: + clCreateFromGLTexture3DOnEnter(data, start); + break; + case CL_FUNCTION_clCreateImage: + clCreateImageOnEnter(data, start); + break; + case CL_FUNCTION_clCreateImage2D: + clCreateImage2DOnEnter(data, start); + break; + case CL_FUNCTION_clCreateImage3D: + clCreateImage3DOnEnter(data, start); + break; + case CL_FUNCTION_clCreateKernel: + clCreateKernelOnEnter(data, start); + break; + case CL_FUNCTION_clCreateKernelsInProgram: + clCreateKernelsInProgramOnEnter(data, start); + break; + case CL_FUNCTION_clCreatePipe: + clCreatePipeOnEnter(data, start); + break; + case CL_FUNCTION_clCreateProgramWithBinary: + clCreateProgramWithBinaryOnEnter(data, start); + break; + case CL_FUNCTION_clCreateProgramWithBuiltInKernels: + clCreateProgramWithBuiltInKernelsOnEnter(data, start); + break; + case CL_FUNCTION_clCreateProgramWithIL: + clCreateProgramWithILOnEnter(data, start); + break; + case CL_FUNCTION_clCreateProgramWithSource: + clCreateProgramWithSourceOnEnter(data, start); + break; + case CL_FUNCTION_clCreateSampler: + clCreateSamplerOnEnter(data, start); + break; + case CL_FUNCTION_clCreateSamplerWithProperties: + clCreateSamplerWithPropertiesOnEnter(data, start); + break; + case CL_FUNCTION_clCreateSubBuffer: + clCreateSubBufferOnEnter(data, start); + break; + case CL_FUNCTION_clCreateSubDevices: + clCreateSubDevicesOnEnter(data, start); + break; + case CL_FUNCTION_clCreateUserEvent: + clCreateUserEventOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueAcquireGLObjects: + clEnqueueAcquireGLObjectsOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueBarrier: + clEnqueueBarrierOnEnter(data, start); + break; + case 
CL_FUNCTION_clEnqueueBarrierWithWaitList: + clEnqueueBarrierWithWaitListOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueCopyBuffer: + clEnqueueCopyBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueCopyBufferRect: + clEnqueueCopyBufferRectOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueCopyBufferToImage: + clEnqueueCopyBufferToImageOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueCopyImage: + clEnqueueCopyImageOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueCopyImageToBuffer: + clEnqueueCopyImageToBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueFillBuffer: + clEnqueueFillBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueFillImage: + clEnqueueFillImageOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueMapBuffer: + clEnqueueMapBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueMapImage: + clEnqueueMapImageOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueMarker: + clEnqueueMarkerOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueMarkerWithWaitList: + clEnqueueMarkerWithWaitListOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueMigrateMemObjects: + clEnqueueMigrateMemObjectsOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueNDRangeKernel: + clEnqueueNDRangeKernelOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueNativeKernel: + clEnqueueNativeKernelOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueReadBuffer: + clEnqueueReadBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueReadBufferRect: + clEnqueueReadBufferRectOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueReadImage: + clEnqueueReadImageOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueReleaseGLObjects: + clEnqueueReleaseGLObjectsOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueSVMFree: + clEnqueueSVMFreeOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueSVMMap: + clEnqueueSVMMapOnEnter(data, start); + 
break; + case CL_FUNCTION_clEnqueueSVMMemFill: + clEnqueueSVMMemFillOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueSVMMemcpy: + clEnqueueSVMMemcpyOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueSVMMigrateMem: + clEnqueueSVMMigrateMemOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueSVMUnmap: + clEnqueueSVMUnmapOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueTask: + clEnqueueTaskOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueUnmapMemObject: + clEnqueueUnmapMemObjectOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueWaitForEvents: + clEnqueueWaitForEventsOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueWriteBuffer: + clEnqueueWriteBufferOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueWriteBufferRect: + clEnqueueWriteBufferRectOnEnter(data, start); + break; + case CL_FUNCTION_clEnqueueWriteImage: + clEnqueueWriteImageOnEnter(data, start); + break; + case CL_FUNCTION_clFinish: + clFinishOnEnter(data, start); + break; + case CL_FUNCTION_clFlush: + clFlushOnEnter(data, start); + break; + case CL_FUNCTION_clGetCommandQueueInfo: + clGetCommandQueueInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetContextInfo: + clGetContextInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetDeviceAndHostTimer: + clGetDeviceAndHostTimerOnEnter(data, start); + break; + case CL_FUNCTION_clGetDeviceIDs: + clGetDeviceIDsOnEnter(data, start); + break; + case CL_FUNCTION_clGetDeviceInfo: + clGetDeviceInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetEventInfo: + clGetEventInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetEventProfilingInfo: + clGetEventProfilingInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetExtensionFunctionAddress: + clGetExtensionFunctionAddressOnEnter(data, start); + break; + case CL_FUNCTION_clGetExtensionFunctionAddressForPlatform: + clGetExtensionFunctionAddressForPlatformOnEnter(data, start); + break; + case CL_FUNCTION_clGetGLObjectInfo: + 
clGetGLObjectInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetGLTextureInfo: + clGetGLTextureInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetHostTimer: + clGetHostTimerOnEnter(data, start); + break; + case CL_FUNCTION_clGetImageInfo: + clGetImageInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetKernelArgInfo: + clGetKernelArgInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetKernelInfo: + clGetKernelInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetKernelSubGroupInfo: + clGetKernelSubGroupInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetKernelWorkGroupInfo: + clGetKernelWorkGroupInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetMemObjectInfo: + clGetMemObjectInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetPipeInfo: + clGetPipeInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetPlatformIDs: + clGetPlatformIDsOnEnter(data, start); + break; + case CL_FUNCTION_clGetPlatformInfo: + clGetPlatformInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetProgramBuildInfo: + clGetProgramBuildInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetProgramInfo: + clGetProgramInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetSamplerInfo: + clGetSamplerInfoOnEnter(data, start); + break; + case CL_FUNCTION_clGetSupportedImageFormats: + clGetSupportedImageFormatsOnEnter(data, start); + break; + case CL_FUNCTION_clLinkProgram: + clLinkProgramOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseCommandQueue: + clReleaseCommandQueueOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseContext: + clReleaseContextOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseDevice: + clReleaseDeviceOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseEvent: + clReleaseEventOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseKernel: + clReleaseKernelOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseMemObject: + clReleaseMemObjectOnEnter(data, start); + break; + 
case CL_FUNCTION_clReleaseProgram: + clReleaseProgramOnEnter(data, start); + break; + case CL_FUNCTION_clReleaseSampler: + clReleaseSamplerOnEnter(data, start); + break; + case CL_FUNCTION_clRetainCommandQueue: + clRetainCommandQueueOnEnter(data, start); + break; + case CL_FUNCTION_clRetainContext: + clRetainContextOnEnter(data, start); + break; + case CL_FUNCTION_clRetainDevice: + clRetainDeviceOnEnter(data, start); + break; + case CL_FUNCTION_clRetainEvent: + clRetainEventOnEnter(data, start); + break; + case CL_FUNCTION_clRetainKernel: + clRetainKernelOnEnter(data, start); + break; + case CL_FUNCTION_clRetainMemObject: + clRetainMemObjectOnEnter(data, start); + break; + case CL_FUNCTION_clRetainProgram: + clRetainProgramOnEnter(data, start); + break; + case CL_FUNCTION_clRetainSampler: + clRetainSamplerOnEnter(data, start); + break; + case CL_FUNCTION_clSVMAlloc: + clSVMAllocOnEnter(data, start); + break; + case CL_FUNCTION_clSVMFree: + clSVMFreeOnEnter(data, start); + break; + case CL_FUNCTION_clSetCommandQueueProperty: + clSetCommandQueuePropertyOnEnter(data, start); + break; + case CL_FUNCTION_clSetDefaultDeviceCommandQueue: + clSetDefaultDeviceCommandQueueOnEnter(data, start); + break; + case CL_FUNCTION_clSetEventCallback: + clSetEventCallbackOnEnter(data, start); + break; + case CL_FUNCTION_clSetKernelArg: + clSetKernelArgOnEnter(data, start); + break; + case CL_FUNCTION_clSetKernelArgSVMPointer: + clSetKernelArgSVMPointerOnEnter(data, start); + break; + case CL_FUNCTION_clSetKernelExecInfo: + clSetKernelExecInfoOnEnter(data, start); + break; + case CL_FUNCTION_clSetMemObjectDestructorCallback: + clSetMemObjectDestructorCallbackOnEnter(data, start); + break; + case CL_FUNCTION_clSetUserEventStatus: + clSetUserEventStatusOnEnter(data, start); + break; + case CL_FUNCTION_clUnloadCompiler: + clUnloadCompilerOnEnter(data, start); + break; + case CL_FUNCTION_clUnloadPlatformCompiler: + clUnloadPlatformCompilerOnEnter(data, start); + break; + case 
CL_FUNCTION_clWaitForEvents: + clWaitForEventsOnEnter(data, start); + break; + default: + break; + } +} + +static void OnExitFunction( + cl_function_id function, cl_callback_data* data, + uint64_t start, uint64_t end) { + switch (function) { + case CL_FUNCTION_clBuildProgram: + clBuildProgramOnExit(data, start, end); + break; + case CL_FUNCTION_clCloneKernel: + clCloneKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clCompileProgram: + clCompileProgramOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateBuffer: + clCreateBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateCommandQueue: + clCreateCommandQueueOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateCommandQueueWithProperties: + clCreateCommandQueueWithPropertiesOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateContext: + clCreateContextOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateContextFromType: + clCreateContextFromTypeOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateFromGLBuffer: + clCreateFromGLBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateFromGLRenderbuffer: + clCreateFromGLRenderbufferOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateFromGLTexture: + clCreateFromGLTextureOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateFromGLTexture2D: + clCreateFromGLTexture2DOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateFromGLTexture3D: + clCreateFromGLTexture3DOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateImage: + clCreateImageOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateImage2D: + clCreateImage2DOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateImage3D: + clCreateImage3DOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateKernel: + clCreateKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateKernelsInProgram: + clCreateKernelsInProgramOnExit(data, start, end); + break; + case 
CL_FUNCTION_clCreatePipe: + clCreatePipeOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateProgramWithBinary: + clCreateProgramWithBinaryOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateProgramWithBuiltInKernels: + clCreateProgramWithBuiltInKernelsOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateProgramWithIL: + clCreateProgramWithILOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateProgramWithSource: + clCreateProgramWithSourceOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateSampler: + clCreateSamplerOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateSamplerWithProperties: + clCreateSamplerWithPropertiesOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateSubBuffer: + clCreateSubBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateSubDevices: + clCreateSubDevicesOnExit(data, start, end); + break; + case CL_FUNCTION_clCreateUserEvent: + clCreateUserEventOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueAcquireGLObjects: + clEnqueueAcquireGLObjectsOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueBarrier: + clEnqueueBarrierOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueBarrierWithWaitList: + clEnqueueBarrierWithWaitListOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueCopyBuffer: + clEnqueueCopyBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueCopyBufferRect: + clEnqueueCopyBufferRectOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueCopyBufferToImage: + clEnqueueCopyBufferToImageOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueCopyImage: + clEnqueueCopyImageOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueCopyImageToBuffer: + clEnqueueCopyImageToBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueFillBuffer: + clEnqueueFillBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueFillImage: + clEnqueueFillImageOnExit(data, start, 
end); + break; + case CL_FUNCTION_clEnqueueMapBuffer: + clEnqueueMapBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueMapImage: + clEnqueueMapImageOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueMarker: + clEnqueueMarkerOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueMarkerWithWaitList: + clEnqueueMarkerWithWaitListOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueMigrateMemObjects: + clEnqueueMigrateMemObjectsOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueNDRangeKernel: + clEnqueueNDRangeKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueNativeKernel: + clEnqueueNativeKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueReadBuffer: + clEnqueueReadBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueReadBufferRect: + clEnqueueReadBufferRectOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueReadImage: + clEnqueueReadImageOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueReleaseGLObjects: + clEnqueueReleaseGLObjectsOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMFree: + clEnqueueSVMFreeOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMMap: + clEnqueueSVMMapOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMMemFill: + clEnqueueSVMMemFillOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMMemcpy: + clEnqueueSVMMemcpyOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMMigrateMem: + clEnqueueSVMMigrateMemOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueSVMUnmap: + clEnqueueSVMUnmapOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueTask: + clEnqueueTaskOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueUnmapMemObject: + clEnqueueUnmapMemObjectOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueWaitForEvents: + clEnqueueWaitForEventsOnExit(data, start, end); + break; + case 
CL_FUNCTION_clEnqueueWriteBuffer: + clEnqueueWriteBufferOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueWriteBufferRect: + clEnqueueWriteBufferRectOnExit(data, start, end); + break; + case CL_FUNCTION_clEnqueueWriteImage: + clEnqueueWriteImageOnExit(data, start, end); + break; + case CL_FUNCTION_clFinish: + clFinishOnExit(data, start, end); + break; + case CL_FUNCTION_clFlush: + clFlushOnExit(data, start, end); + break; + case CL_FUNCTION_clGetCommandQueueInfo: + clGetCommandQueueInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetContextInfo: + clGetContextInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetDeviceAndHostTimer: + clGetDeviceAndHostTimerOnExit(data, start, end); + break; + case CL_FUNCTION_clGetDeviceIDs: + clGetDeviceIDsOnExit(data, start, end); + break; + case CL_FUNCTION_clGetDeviceInfo: + clGetDeviceInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetEventInfo: + clGetEventInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetEventProfilingInfo: + clGetEventProfilingInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetExtensionFunctionAddress: + clGetExtensionFunctionAddressOnExit(data, start, end); + break; + case CL_FUNCTION_clGetExtensionFunctionAddressForPlatform: + clGetExtensionFunctionAddressForPlatformOnExit(data, start, end); + break; + case CL_FUNCTION_clGetGLObjectInfo: + clGetGLObjectInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetGLTextureInfo: + clGetGLTextureInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetHostTimer: + clGetHostTimerOnExit(data, start, end); + break; + case CL_FUNCTION_clGetImageInfo: + clGetImageInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetKernelArgInfo: + clGetKernelArgInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetKernelInfo: + clGetKernelInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetKernelSubGroupInfo: + clGetKernelSubGroupInfoOnExit(data, start, end); + break; + 
case CL_FUNCTION_clGetKernelWorkGroupInfo: + clGetKernelWorkGroupInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetMemObjectInfo: + clGetMemObjectInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetPipeInfo: + clGetPipeInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetPlatformIDs: + clGetPlatformIDsOnExit(data, start, end); + break; + case CL_FUNCTION_clGetPlatformInfo: + clGetPlatformInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetProgramBuildInfo: + clGetProgramBuildInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetProgramInfo: + clGetProgramInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetSamplerInfo: + clGetSamplerInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clGetSupportedImageFormats: + clGetSupportedImageFormatsOnExit(data, start, end); + break; + case CL_FUNCTION_clLinkProgram: + clLinkProgramOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseCommandQueue: + clReleaseCommandQueueOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseContext: + clReleaseContextOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseDevice: + clReleaseDeviceOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseEvent: + clReleaseEventOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseKernel: + clReleaseKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseMemObject: + clReleaseMemObjectOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseProgram: + clReleaseProgramOnExit(data, start, end); + break; + case CL_FUNCTION_clReleaseSampler: + clReleaseSamplerOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainCommandQueue: + clRetainCommandQueueOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainContext: + clRetainContextOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainDevice: + clRetainDeviceOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainEvent: + clRetainEventOnExit(data, 
start, end); + break; + case CL_FUNCTION_clRetainKernel: + clRetainKernelOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainMemObject: + clRetainMemObjectOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainProgram: + clRetainProgramOnExit(data, start, end); + break; + case CL_FUNCTION_clRetainSampler: + clRetainSamplerOnExit(data, start, end); + break; + case CL_FUNCTION_clSVMAlloc: + clSVMAllocOnExit(data, start, end); + break; + case CL_FUNCTION_clSVMFree: + clSVMFreeOnExit(data, start, end); + break; + case CL_FUNCTION_clSetCommandQueueProperty: + clSetCommandQueuePropertyOnExit(data, start, end); + break; + case CL_FUNCTION_clSetDefaultDeviceCommandQueue: + clSetDefaultDeviceCommandQueueOnExit(data, start, end); + break; + case CL_FUNCTION_clSetEventCallback: + clSetEventCallbackOnExit(data, start, end); + break; + case CL_FUNCTION_clSetKernelArg: + clSetKernelArgOnExit(data, start, end); + break; + case CL_FUNCTION_clSetKernelArgSVMPointer: + clSetKernelArgSVMPointerOnExit(data, start, end); + break; + case CL_FUNCTION_clSetKernelExecInfo: + clSetKernelExecInfoOnExit(data, start, end); + break; + case CL_FUNCTION_clSetMemObjectDestructorCallback: + clSetMemObjectDestructorCallbackOnExit(data, start, end); + break; + case CL_FUNCTION_clSetUserEventStatus: + clSetUserEventStatusOnExit(data, start, end); + break; + case CL_FUNCTION_clUnloadCompiler: + clUnloadCompilerOnExit(data, start, end); + break; + case CL_FUNCTION_clUnloadPlatformCompiler: + clUnloadPlatformCompilerOnExit(data, start, end); + break; + case CL_FUNCTION_clWaitForEvents: + clWaitForEventsOnExit(data, start, end); + break; + default: + break; + } +} + +#endif // PTI_SAMPLES_CL_HOT_FUNCTIONS_CL_API_CALLBACKS_H_ \ No newline at end of file diff --git a/samples/cl_hot_functions/cl_api_collector.h b/samples/cl_hot_functions/cl_api_collector.h index 6db98f1..86c3e86 100644 --- a/samples/cl_hot_functions/cl_api_collector.h +++ b/samples/cl_hot_functions/cl_api_collector.h @@ -1,5 
+1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -14,23 +14,26 @@ #include #include -#include "cl_tracer.h" +#include "cl_api_tracer.h" #include "cl_utils.h" +#include "trace_guard.h" -struct Function { +#include "cl_api_callbacks.h" + +struct ClFunction { uint64_t total_time; uint64_t min_time; uint64_t max_time; uint64_t call_count; - bool operator>(const Function& r) const { + bool operator>(const ClFunction& r) const { if (total_time != r.total_time) { return total_time > r.total_time; } return call_count > r.call_count; } - bool operator!=(const Function& r) const { + bool operator!=(const ClFunction& r) const { if (total_time == r.total_time) { return call_count != r.call_count; } @@ -38,20 +41,29 @@ struct Function { } }; -using FunctionInfoMap = std::map; -using FunctionTimePoint = std::chrono::time_point; +using ClFunctionInfoMap = std::map; +using ClFunctionTimePoint = std::chrono::time_point; + +typedef void (*OnClFunctionFinishCallback)( + void* data, const std::string& name, + uint64_t started, uint64_t ended); class ClApiCollector { public: // User Interface static ClApiCollector* Create( cl_device_id device, - FunctionTimePoint base_time = std::chrono::steady_clock::now()) { + ClFunctionTimePoint base_time = std::chrono::steady_clock::now(), + bool call_tracing = false, + OnClFunctionFinishCallback callback = nullptr, + void* callback_data = nullptr) { PTI_ASSERT(device != nullptr); + TraceGuard guard; - ClApiCollector* collector = new ClApiCollector(base_time); + ClApiCollector* collector = new ClApiCollector( + base_time, call_tracing, callback, callback_data); PTI_ASSERT(collector != nullptr); - ClTracer* tracer = new ClTracer(device, Callback, collector); + ClApiTracer* tracer = new ClApiTracer(device, Callback, collector); if (tracer == nullptr || 
!tracer->IsValid()) { std::cerr << "[WARNING] Unable to create OpenCL tracer " << "for target device" << std::endl; @@ -78,15 +90,15 @@ class ClApiCollector { PTI_ASSERT(disabled); } - const FunctionInfoMap& GetFunctionInfoMap() const { + const ClFunctionInfoMap& GetFunctionInfoMap() const { return function_info_map_; } ClApiCollector(const ClApiCollector& copy) = delete; ClApiCollector& operator=(const ClApiCollector& copy) = delete; - static void PrintFunctionsTable(const FunctionInfoMap& function_info_map) { - std::set< std::pair, + static void PrintFunctionsTable(const ClFunctionInfoMap& function_info_map) { + std::set< std::pair, utils::Comparator > sorted_list( function_info_map.begin(), function_info_map.end()); @@ -131,9 +143,13 @@ class ClApiCollector { } private: // Implementation Details - ClApiCollector(FunctionTimePoint base_time) : base_time_(base_time) {} + ClApiCollector( + ClFunctionTimePoint base_time, bool call_tracing, + OnClFunctionFinishCallback callback, void* callback_data) + : base_time_(base_time), call_tracing_(call_tracing), + callback_(callback), callback_data_(callback_data) {} - void EnableTracing(ClTracer* tracer) { + void EnableTracing(ClApiTracer* tracer) { PTI_ASSERT(tracer != nullptr); tracer_ = tracer; @@ -157,7 +173,7 @@ class ClApiCollector { if (function_info_map_.count(name) == 0) { function_info_map_[name] = {time, time, time, 1}; } else { - Function& function = function_info_map_[name]; + ClFunction& function = function_info_map_[name]; function.total_time += time; if (time < function.min_time) { function.min_time = time; @@ -170,43 +186,56 @@ class ClApiCollector { } private: // Callbacks - static void OnFunctionEnter(cl_callback_data* data, void* user_data) { - ClApiCollector* collector = reinterpret_cast(user_data); - PTI_ASSERT(collector != nullptr); - - PTI_ASSERT(data != nullptr); - uint64_t& start_time = *reinterpret_cast(data->correlationData); - start_time = collector->GetTimestamp(); - } - - static void 
OnFunctionExit(cl_callback_data* data, void* user_data) { - ClApiCollector* collector = reinterpret_cast(user_data); - PTI_ASSERT(collector != nullptr); - uint64_t end_time = collector->GetTimestamp(); - - PTI_ASSERT(data != nullptr); - uint64_t& start_time = *reinterpret_cast(data->correlationData); - collector->AddFunctionTime(data->functionName, end_time - start_time); - } - static void Callback( cl_function_id function, cl_callback_data* callback_data, void* user_data) { + if (TraceGuard::Inactive()) return; + + ClApiCollector* collector = reinterpret_cast(user_data); + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(callback_data != nullptr); + PTI_ASSERT(callback_data->correlationData != nullptr); + if (callback_data->site == CL_CALLBACK_SITE_ENTER) { - OnFunctionEnter(callback_data, user_data); + uint64_t& start_time = *reinterpret_cast( + callback_data->correlationData); + start_time = collector->GetTimestamp(); + + if (collector->call_tracing_) { + OnEnterFunction(function, callback_data, start_time); + } } else { - OnFunctionExit(callback_data, user_data); + uint64_t end_time = collector->GetTimestamp(); + + uint64_t& start_time = *reinterpret_cast( + callback_data->correlationData); + collector->AddFunctionTime( + callback_data->functionName, end_time - start_time); + + if (collector->call_tracing_) { + OnExitFunction(function, callback_data, start_time, end_time); + } + + if (collector->callback_ != nullptr) { + collector->callback_( + collector->callback_data_, callback_data->functionName, + start_time, end_time); + } } } private: // Data - ClTracer* tracer_ = nullptr; + ClApiTracer* tracer_ = nullptr; + + ClFunctionTimePoint base_time_; + bool call_tracing_ = false; - FunctionTimePoint base_time_; + OnClFunctionFinishCallback callback_ = nullptr; + void* callback_data_ = nullptr; std::mutex lock_; - FunctionInfoMap function_info_map_; + ClFunctionInfoMap function_info_map_; static const uint32_t kFunctionLength = 10; static const uint32_t kCallsLength 
= 12; diff --git a/samples/cl_hot_functions/tool.cc b/samples/cl_hot_functions/tool.cc index 5d64172..9bfd749 100644 --- a/samples/cl_hot_functions/tool.cc +++ b/samples/cl_hot_functions/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -51,7 +51,7 @@ static uint64_t CalculateTotalTime(ClApiCollector* collector) { PTI_ASSERT(collector != nullptr); uint64_t total_duration = 0; - const FunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); if (function_info_map.size() != 0) { for (auto& value : function_info_map) { total_duration += value.second.total_time; @@ -72,7 +72,7 @@ static void PrintDeviceTable( std::cerr << "== " << device_type << " Backend: ==" << std::endl; std::cerr << std::endl; - const FunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); PTI_ASSERT(function_info_map.size() > 0); ClApiCollector::PrintFunctionsTable(function_info_map); } diff --git a/samples/cl_hot_kernels/CMakeLists.txt b/samples/cl_hot_kernels/CMakeLists.txt index e958ac5..5c52014 100644 --- a/samples/cl_hot_kernels/CMakeLists.txt +++ b/samples/cl_hot_kernels/CMakeLists.txt @@ -8,7 +8,10 @@ SetBuildType() # Tool Library -add_library(clt_hot_kernels SHARED "${PROJECT_SOURCE_DIR}/../loader/init.cc" tool.cc) +add_library(clt_hot_kernels SHARED + "${PROJECT_SOURCE_DIR}/../utils/trace_guard.cc" + "${PROJECT_SOURCE_DIR}/../loader/init.cc" + tool.cc) target_include_directories(clt_hot_kernels PRIVATE "${PROJECT_SOURCE_DIR}/../utils") if(CMAKE_INCLUDE_PATH) diff --git a/samples/cl_hot_kernels/cl_kernel_collector.h b/samples/cl_hot_kernels/cl_kernel_collector.h index aa3b32f..5a40d80 
100644 --- a/samples/cl_hot_kernels/cl_kernel_collector.h +++ b/samples/cl_hot_kernels/cl_kernel_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -13,27 +13,28 @@ #include #include -#include "cl_tracer.h" +#include "cl_api_tracer.h" #include "cl_utils.h" +#include "trace_guard.h" class ClKernelCollector; -enum KernelType { +enum ClKernelType { KERNEL_TYPE_USER, KERNEL_TYPE_TRANSFER }; -struct EventData { +struct ClEventData { ClKernelCollector* collector; std::string kernel_name; - KernelType kernel_type; + ClKernelType kernel_type; union { cl_kernel kernel; size_t bytes_transferred; }; }; -struct KernelInfo { +struct ClKernelInfo { uint64_t total_time; uint64_t min_time; uint64_t max_time; @@ -41,14 +42,14 @@ struct KernelInfo { size_t simd_width; size_t bytes_transferred; - bool operator>(const KernelInfo& r) const { + bool operator>(const ClKernelInfo& r) const { if (total_time != r.total_time) { return total_time > r.total_time; } return call_count > r.call_count; } - bool operator!=(const KernelInfo& r) const { + bool operator!=(const ClKernelInfo& r) const { if (total_time == r.total_time) { return call_count != r.call_count; } @@ -56,24 +57,36 @@ struct KernelInfo { } }; -struct KernelInterval { +struct ClKernelInterval { std::string name; uint64_t start; uint64_t end; }; -using KernelInfoMap = std::map; -using KernelIntervalList = std::vector; +using ClKernelInfoMap = std::map; +using ClKernelIntervalList = std::vector; +using ClKernelTimePoint = std::chrono::time_point; + +typedef void (*OnClKernelFinishCallback)( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended); class ClKernelCollector { - public: // User Interface - static ClKernelCollector* 
Create(cl_device_id device) { + public: // Interface + static ClKernelCollector* Create( + cl_device_id device, + ClKernelTimePoint base_time = std::chrono::steady_clock::now(), + OnClKernelFinishCallback callback = nullptr, + void* callback_data = nullptr) { PTI_ASSERT(device != nullptr); + TraceGuard guard; - ClKernelCollector* collector = new ClKernelCollector(); + ClKernelCollector* collector = new ClKernelCollector( + device, base_time, callback, callback_data); PTI_ASSERT(collector != nullptr); - ClTracer* tracer = new ClTracer(device, Callback, collector); + ClApiTracer* tracer = new ClApiTracer(device, Callback, collector); if (tracer == nullptr || !tracer->IsValid()) { std::cerr << "[WARNING] Unable to create OpenCL tracer " << "for target device" << std::endl; @@ -100,19 +113,19 @@ class ClKernelCollector { PTI_ASSERT(disabled); } - const KernelInfoMap& GetKernelInfoMap() const { + const ClKernelInfoMap& GetKernelInfoMap() const { return kernel_info_map_; } - const KernelIntervalList& GetKernelIntervalList() const { + const ClKernelIntervalList& GetKernelIntervalList() const { return kernel_interval_list_; } ClKernelCollector(const ClKernelCollector& copy) = delete; ClKernelCollector& operator=(const ClKernelCollector& copy) = delete; - static void PrintKernelsTable(const KernelInfoMap& kernel_info_map) { - std::set< std::pair, + static void PrintKernelsTable(const ClKernelInfoMap& kernel_info_map) { + std::set< std::pair, utils::Comparator > sorted_list( kernel_info_map.begin(), kernel_info_map.end()); @@ -165,9 +178,27 @@ class ClKernelCollector { } private: // Implementation Details - ClKernelCollector() {} + ClKernelCollector( + cl_device_id device, + ClKernelTimePoint base_time, + OnClKernelFinishCallback callback, + void* callback_data) + : base_time_(base_time), + callback_(callback), + callback_data_(callback_data) { + if (callback_ != nullptr) { + cl_device_type device_type = utils::cl::GetDeviceType(device); + if (device_type == 
CL_DEVICE_TYPE_GPU) { + dev_timestamp_ = utils::cl::GetGpuTimestamp(); + } else { + PTI_ASSERT(device_type == CL_DEVICE_TYPE_CPU); + dev_timestamp_ = utils::cl::GetCpuTimestamp(); + } + cpu_timestamp_ = std::chrono::steady_clock::now(); + } + } - void EnableTracing(ClTracer* tracer) { + void EnableTracing(ClApiTracer* tracer) { PTI_ASSERT(tracer != nullptr); tracer_ = tracer; @@ -192,7 +223,7 @@ class ClKernelCollector { kernel_info_map_[name] = { time, time, time, 1, simd_width, bytes_transferred}; } else { - KernelInfo& kernel = kernel_info_map_[name]; + ClKernelInfo& kernel = kernel_info_map_[name]; kernel.total_time += time; if (time > kernel.max_time) { kernel.max_time = time; @@ -216,16 +247,29 @@ class ClKernelCollector { static void CL_CALLBACK EventNotify( cl_event event, cl_int event_status, void* user_data) { PTI_ASSERT(event_status == CL_COMPLETE); + TraceGuard guard; PTI_ASSERT(user_data != nullptr); - EventData* event_data = reinterpret_cast(user_data); + ClEventData* event_data = reinterpret_cast(user_data); + + ClKernelCollector* collector = event_data->collector; + PTI_ASSERT(collector != nullptr); + + cl_command_queue queue = utils::cl::GetCommandQueue(event); + PTI_ASSERT(queue != nullptr); + + std::string name = event_data->kernel_name; + PTI_ASSERT(!name.empty()); + + cl_ulong started = + utils::cl::GetEventTimestamp(event, CL_PROFILING_COMMAND_START); + cl_ulong ended = + utils::cl::GetEventTimestamp(event, CL_PROFILING_COMMAND_END); + cl_ulong time = ended - started; + PTI_ASSERT(time > 0); if (event_data->kernel_type == KERNEL_TYPE_USER) { cl_kernel kernel = event_data->kernel; - std::string name = event_data->kernel_name; - - cl_command_queue queue = utils::cl::GetCommandQueue(event); - PTI_ASSERT(queue != nullptr); cl_device_id device = utils::cl::GetDevice(queue); PTI_ASSERT(device != nullptr); @@ -233,40 +277,54 @@ class ClKernelCollector { size_t simd_width = utils::cl::GetSimdWidth(device, kernel); PTI_ASSERT(simd_width > 0); - 
cl_ulong start = utils::cl::GetEventStartTime(event); - cl_ulong end = utils::cl::GetEventEndTime(event); - cl_ulong time = end - start; - PTI_ASSERT(time > 0); - cl_int status = clReleaseKernel(kernel); PTI_ASSERT(status == CL_SUCCESS); - status = clReleaseEvent(event); - PTI_ASSERT(status == CL_SUCCESS); - - PTI_ASSERT(event_data->collector != nullptr); - event_data->collector->AddKernelInfo(name, time, simd_width, 0); - event_data->collector->AddKernelInterval(name, start, end); + collector->AddKernelInfo(name, time, simd_width, 0); + collector->AddKernelInterval(name, started, ended); } else { PTI_ASSERT(event_data->kernel_type == KERNEL_TYPE_TRANSFER); - std::string name = event_data->kernel_name; size_t bytes_transferred = event_data->bytes_transferred; PTI_ASSERT(bytes_transferred > 0); - cl_ulong start = utils::cl::GetEventStartTime(event); - cl_ulong end = utils::cl::GetEventEndTime(event); - cl_ulong time = end - start; - PTI_ASSERT(time > 0); - - cl_int status = clReleaseEvent(event); - PTI_ASSERT(status == CL_SUCCESS); + collector->AddKernelInfo(name, time, 0, bytes_transferred); + } - PTI_ASSERT(event_data->collector != nullptr); - event_data->collector->AddKernelInfo(name, time, 0, bytes_transferred); + if (collector->callback_ != nullptr) { + cl_ulong queued = + utils::cl::GetEventTimestamp(event, CL_PROFILING_COMMAND_QUEUED); + PTI_ASSERT(queued > 0); + cl_ulong submitted = + utils::cl::GetEventTimestamp(event, CL_PROFILING_COMMAND_SUBMIT); + PTI_ASSERT(submitted > 0); + + std::chrono::duration time_shift = + collector->cpu_timestamp_ - collector->base_time_; + + PTI_ASSERT(collector->dev_timestamp_ < queued); + PTI_ASSERT(queued < submitted); + PTI_ASSERT(submitted < started); + PTI_ASSERT(started < ended); + + uint64_t cpu_queued = + (queued - collector->dev_timestamp_) + time_shift.count(); + uint64_t cpu_submitted = + (submitted - collector->dev_timestamp_) + time_shift.count(); + uint64_t cpu_started = + (started - collector->dev_timestamp_) + 
time_shift.count(); + uint64_t cpu_ended = + (ended - collector->dev_timestamp_) + time_shift.count(); + + collector->callback_( + collector->callback_data_, queue, name, + cpu_queued, cpu_submitted, cpu_started, cpu_ended); } + cl_int status = clReleaseEvent(event); + PTI_ASSERT(status == CL_SUCCESS); + delete event_data; } @@ -338,7 +396,7 @@ class ClKernelCollector { PTI_ASSERT(status == CL_SUCCESS); } - EventData* event_data = new EventData; + ClEventData* event_data = new ClEventData; PTI_ASSERT(event_data != nullptr); cl_kernel kernel = *(params->kernel); event_data->collector = collector; @@ -388,7 +446,7 @@ class ClKernelCollector { PTI_ASSERT(status == CL_SUCCESS); } - EventData* event_data = new EventData; + ClEventData* event_data = new ClEventData; PTI_ASSERT(event_data != nullptr); event_data->collector = collector; event_data->kernel_name = "clEnqueueReadBuffer"; @@ -435,7 +493,7 @@ class ClKernelCollector { PTI_ASSERT(status == CL_SUCCESS); } - EventData* event_data = new EventData; + ClEventData* event_data = new ClEventData; PTI_ASSERT(event_data != nullptr); event_data->collector = collector; event_data->kernel_name = "clEnqueueWriteBuffer"; @@ -451,6 +509,9 @@ class ClKernelCollector { static void Callback(cl_function_id function, cl_callback_data* callback_data, void* user_data) { + if (TraceGuard::Inactive()) return; + TraceGuard guard; + ClKernelCollector* collector = reinterpret_cast(user_data); PTI_ASSERT(collector != nullptr); @@ -487,11 +548,18 @@ class ClKernelCollector { } private: // Data - ClTracer* tracer_ = nullptr; + ClApiTracer* tracer_ = nullptr; + ClKernelTimePoint base_time_; + + OnClKernelFinishCallback callback_ = nullptr; + void* callback_data_ = nullptr; + + ClKernelTimePoint cpu_timestamp_; + uint64_t dev_timestamp_ = 0; std::mutex lock_; - KernelInfoMap kernel_info_map_; - KernelIntervalList kernel_interval_list_; + ClKernelInfoMap kernel_info_map_; + ClKernelIntervalList kernel_interval_list_; static const uint32_t 
kKernelLength = 10; static const uint32_t kCallsLength = 12; diff --git a/samples/cl_hot_kernels/tool.cc b/samples/cl_hot_kernels/tool.cc index 7a60797..36b974f 100644 --- a/samples/cl_hot_kernels/tool.cc +++ b/samples/cl_hot_kernels/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -50,7 +50,7 @@ static uint64_t CalculateTotalTime(ClKernelCollector* collector) { PTI_ASSERT(collector != nullptr); uint64_t total_duration = 0; - const KernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); if (kernel_info_map.size() != 0) { for (auto& value : kernel_info_map) { total_duration += value.second.total_time; @@ -71,7 +71,7 @@ static void PrintDeviceTable( std::cerr << "== " << device_type << " Backend: ==" << std::endl; std::cerr << std::endl; - const KernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); PTI_ASSERT(kernel_info_map.size() > 0); ClKernelCollector::PrintKernelsTable(kernel_info_map); } diff --git a/samples/cl_tracer/CMakeLists.txt b/samples/cl_tracer/CMakeLists.txt new file mode 100644 index 0000000..53d413b --- /dev/null +++ b/samples/cl_tracer/CMakeLists.txt @@ -0,0 +1,39 @@ +include("../build_utils/CMakeLists.txt") +SetRequiredCMakeVersion() +cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION}) + +project(PTI_Samples_CL_Tracer CXX) +SetCompilerFlags() +SetBuildType() + +# Tool Library + +add_library(clt_tracer SHARED + "${PROJECT_SOURCE_DIR}/../utils/trace_guard.cc" + "${PROJECT_SOURCE_DIR}/../loader/init.cc" + tool.cc) +target_include_directories(clt_tracer + PRIVATE "${PROJECT_SOURCE_DIR}" + PRIVATE "${PROJECT_SOURCE_DIR}/../utils" + PRIVATE 
"${PROJECT_SOURCE_DIR}/../cl_hot_functions" + PRIVATE "${PROJECT_SOURCE_DIR}/../cl_hot_kernels") +if(CMAKE_INCLUDE_PATH) + target_include_directories(clt_tracer + PUBLIC "${CMAKE_INCLUDE_PATH}") +endif() + +FindOpenCLLibrary(clt_tracer) +FindOpenCLHeaders(clt_tracer) + +GetOpenCLTracingHeaders(clt_tracer) + +# Loader + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOOL_NAME=clt_tracer") +add_executable(cl_tracer "${PROJECT_SOURCE_DIR}/../loader/loader.cc") +target_include_directories(cl_tracer + PRIVATE "${PROJECT_SOURCE_DIR}/../utils") +if(UNIX) + target_link_libraries(cl_tracer + dl) +endif() \ No newline at end of file diff --git a/samples/cl_tracer/README.md b/samples/cl_tracer/README.md new file mode 100644 index 0000000..d1169d3 --- /dev/null +++ b/samples/cl_tracer/README.md @@ -0,0 +1,129 @@ +# OpenCL(TM) Tracer +## Overview +This tool is an analogue of [Intercept Layer for OpenCL(TM) Applications](https://github.com/intel/opencl-intercept-layer) designed based on internal [tracing mechanism](../../chapters/runtime_api_tracing/OpenCL.md) implemented in Intel runtimes for OpenCL(TM). + +The following capabilities are available currently: +``` +Usage: ./cl_tracer[.exe] [options] +Options: +--call-logging [-c] Trace host API calls +--host-timing [-h] Report host API execution time +--device-timing [-d] Report kernels execution time +--device-timeline [-t] Trace device activities +--chrome-device-timeline Dump device activities to JSON file +--chrome-call-logging Dump host API calls to JSON file +``` + +**Call Logging** mode allows to grab full host API trace, e.g.: +``` +...
+>>>> [271632470] clCreateBuffer: context = 0x5591dba3f860 flags = 4 size = 4194304 hostPtr = 0 errcodeRet = 0x7ffd334b2f04 +<<<< [271640078] clCreateBuffer [7608 ns] result = 0x5591dbaa5760 -> CL_SUCCESS (0) +>>>> [272171119] clEnqueueWriteBuffer: commandQueue = 0x5591dbf4be70 buffer = 0x5591dbaa5760 blockingWrite = 1 offset = 0 cb = 4194304 ptr = 0x5591dc92af90 numEventsInWaitList = 0 eventWaitList = 0 event = 0 +<<<< [272698660] clEnqueueWriteBuffer [527541 ns] -> CL_SUCCESS (0) +>>>> [272716922] clSetKernelArg: kernel = 0x5591dc500c60 argIndex = 0 argSize = 8 argValue = 0x7ffd334b2f10 +<<<< [272724034] clSetKernelArg [7112 ns] -> CL_SUCCESS (0) +>>>> [272729938] clSetKernelArg: kernel = 0x5591dc500c60 argIndex = 1 argSize = 8 argValue = 0x7ffd334b2f18 +<<<< [272733712] clSetKernelArg [3774 ns] -> CL_SUCCESS (0) +... +``` +**Chrome Call Logging** mode dumps API calls to JSON format that can be opened in [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) browser tool. + +**Host Timing** mode collects duration for each API call and provides the summary for the whole application: +``` +=== API Timing Results: === + + Total Execution Time (ns): 366500174 +Total API Time for CPU backend (ns): 16851 +Total API Time for GPU backend (ns): 357744252 + +== CPU Backend: == + + Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) +clGetDeviceIDs, 1, 16851, 100.00, 16851, 16851, 16851 + +== GPU Backend: == + + Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + clFinish, 4, 174933263, 48.90, 43733315, 42966659, 44067629 + clBuildProgram, 1, 172466699, 48.21, 172466699, 172466699, 172466699 + clEnqueueWriteBuffer, 8, 3788816, 1.06, 473602, 367912, 593802 + clEnqueueNDRangeKernel, 4, 3238743, 0.91, 809685, 208889, 2562605 +... 
+``` +**Device Timing** mode collects duration for each kernel on the device and provides the summary for the whole application: +``` +=== Device Timing Results: === + + Total Execution Time (ns): 366500174 +Total Device Time for CPU backend (ns): 0 +Total Device Time for GPU backend (ns): 180543441 + +== GPU Backend: == + + Kernel, Calls, SIMD, Transferred (bytes), Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + GEMM, 4, 32, 0, 174210248, 96.49, 43552562, 42764416, 43851333 +clEnqueueWriteBuffer, 8, 0, 33554432, 3683507, 2.04, 460438, 355983, 584325 + clEnqueueReadBuffer, 4, 0, 16777216, 2649686, 1.47, 662421, 607215, 702940 +... +``` +**Device Timeline** mode dumps four timestamps for each device activity - *queued* to the host command queue, *submit* to device queue, *start* and *end* on the device (all the timestamps are in CPU nanoseconds): +``` +... +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueWriteBuffer [ns] = 317341082 (queued) 317355010 (submit) 317452332 (start) 317980165 (end) +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueWriteBuffer [ns] = 317789774 (queued) 317814558 (submit) 318160607 (start) 318492690 (end) +Device Timeline (queue: 0x55a9c7e51e70): GEMM [ns] = 318185764 (queued) 318200629 (submit) 318550014 (start) 361260930 (end) +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueReadBuffer [ns] = 361479600 (queued) 361481387 (submit) 361482574 (start) 362155593 (end) +... +``` +**Chrome Device Timeline** mode dumps timestamps for device activities to JSON format that can be opened in [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) browser tool. 
+ +## Supported OS +- Linux +- Windows (*under development*) + +## Prerequisites +- [CMake](https://cmake.org/) (version 2.8 and above) +- [Git](https://git-scm.com/) (version 1.8 and above) +- [Python](https://www.python.org/) (version 2.7 and above) +- [OpenCL(TM) ICD Loader](https://github.com/KhronosGroup/OpenCL-ICD-Loader) +- [Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver](https://github.com/intel/compute-runtime) to run on GPU +- [Intel(R) Xeon(R) Processor / Intel(R) Core(TM) Processor (CPU) Runtimes](https://software.intel.com/en-us/articles/opencl-drivers#cpu-section) to run on CPU + +## Build and Run +### Linux +Run the following commands to build the sample: +```sh +cd /samples/cl_tracer +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +make +``` +Use this command line to run the tool: +```sh +./cl_tracer [options] +``` +One may use [cl_gemm](../cl_gemm) or [dpc_gemm](../dpc_gemm) as target application, e.g.: +```sh +./cl_tracer -c -h ../../cl_gemm/build/cl_gemm +./cl_tracer -c -h ../../dpc_gemm/build/dpc_gemm cpu +``` +### Windows +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample: +```sh +cd \samples\cl_tracer +mkdir build +cd build +cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_LIBRARY_PATH= .. 
+nmake +``` +Use this command line to run the tool: +```sh +cl_tracer.exe [options] +``` +One may use [cl_gemm](../cl_gemm) or [dpc_gemm](../dpc_gemm) as target application, e.g.: +```sh +cl_tracer.exe -c -h ..\..\cl_gemm\build\cl_gemm.exe +cl_tracer.exe -c -h ..\..\dpc_gemm\build\dpc_gemm.exe cpu +``` \ No newline at end of file diff --git a/samples/cl_tracer/cl_tracer.h b/samples/cl_tracer/cl_tracer.h new file mode 100644 index 0000000..f611d6f --- /dev/null +++ b/samples/cl_tracer/cl_tracer.h @@ -0,0 +1,397 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef PTI_SAMPLES_CL_TRACER_CL_TRACER_H_ +#define PTI_SAMPLES_CL_TRACER_CL_TRACER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "cl_api_collector.h" +#include "cl_kernel_collector.h" +#include "utils.h" + +#define CLT_CALL_LOGGING 0 +#define CLT_HOST_TIMING 1 +#define CLT_DEVICE_TIMING 2 +#define CLT_DEVICE_TIMELINE 3 +#define CLT_CHROME_DEVICE_TIMELINE 4 +#define CLT_CHROME_CALL_LOGGING 5 + +const char* kChromeTraceFileName = "cli_trace.json"; + +class ClTracer { + public: + static ClTracer* Create(unsigned options) { + cl_device_id cpu_device = utils::cl::GetIntelDevice(CL_DEVICE_TYPE_CPU); + cl_device_id gpu_device = utils::cl::GetIntelDevice(CL_DEVICE_TYPE_GPU); + if (cpu_device == nullptr && gpu_device == nullptr) { + std::cerr << "[WARNING] Intel OpenCL devices are not found" << std::endl; + return nullptr; + } + + ClTracer* tracer = new ClTracer(options); + + if (tracer->CheckOption(CLT_CALL_LOGGING) || + tracer->CheckOption(CLT_CHROME_CALL_LOGGING) || + tracer->CheckOption(CLT_HOST_TIMING)) { + + ClApiCollector* cpu_api_collector = nullptr; + ClApiCollector* gpu_api_collector = nullptr; + + OnClFunctionFinishCallback callback = nullptr; + if (tracer->CheckOption(CLT_CHROME_CALL_LOGGING)) { + callback = 
ChromeLoggingCallback; + } + + bool call_tracing = tracer->CheckOption(CLT_CALL_LOGGING); + + if (cpu_device != nullptr) { + cpu_api_collector = ClApiCollector::Create( + cpu_device, tracer->start_time_, + call_tracing, callback, tracer); + if (cpu_api_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create API collector for CPU backend" << + std::endl; + } + tracer->cpu_api_collector_ = cpu_api_collector; + } + + if (gpu_device != nullptr) { + gpu_api_collector = ClApiCollector::Create( + gpu_device, tracer->start_time_, + call_tracing, callback, tracer); + if (gpu_api_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create API collector for GPU backend" << + std::endl; + } + tracer->gpu_api_collector_ = gpu_api_collector; + } + + if (gpu_api_collector == nullptr && cpu_api_collector == nullptr) { + delete tracer; + return nullptr; + } + } + + if (tracer->CheckOption(CLT_DEVICE_TIMELINE) || + tracer->CheckOption(CLT_CHROME_DEVICE_TIMELINE) || + tracer->CheckOption(CLT_DEVICE_TIMING)) { + + ClKernelCollector* cpu_kernel_collector = nullptr; + ClKernelCollector* gpu_kernel_collector = nullptr; + + OnClKernelFinishCallback callback = nullptr; + if (tracer->CheckOption(CLT_DEVICE_TIMELINE) && + tracer->CheckOption(CLT_CHROME_DEVICE_TIMELINE)) { + callback = DeviceAndChromeTimelineCallback; + } else if (tracer->CheckOption(CLT_DEVICE_TIMELINE)) { + callback = DeviceTimelineCallback; + } else if (tracer->CheckOption(CLT_CHROME_DEVICE_TIMELINE)) { + callback = ChromeTimelineCallback; + } + + if (cpu_device != nullptr) { + cpu_kernel_collector = ClKernelCollector::Create( + cpu_device, tracer->start_time_, callback, tracer); + if (cpu_kernel_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create kernel collector for CPU backend" << + std::endl; + } + tracer->cpu_kernel_collector_ = cpu_kernel_collector; + } + + if (gpu_device != nullptr) { + gpu_kernel_collector = ClKernelCollector::Create( + gpu_device, tracer->start_time_, 
callback, tracer); + if (gpu_kernel_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create kernel collector for GPU backend" << + std::endl; + } + tracer->gpu_kernel_collector_ = gpu_kernel_collector; + } + + if (cpu_kernel_collector == nullptr && gpu_kernel_collector == nullptr) { + delete tracer; + return nullptr; + } + } + + return tracer; + } + + ~ClTracer() { + std::chrono::steady_clock::time_point end_time = + std::chrono::steady_clock::now(); + std::chrono::duration duration = + end_time - start_time_; + total_execution_time_ = duration.count(); + + if (cpu_api_collector_ != nullptr) { + cpu_api_collector_->DisableTracing(); + } + if (gpu_api_collector_ != nullptr) { + gpu_api_collector_->DisableTracing(); + } + + if (cpu_kernel_collector_ != nullptr) { + cpu_kernel_collector_->DisableTracing(); + } + if (gpu_kernel_collector_ != nullptr) { + gpu_kernel_collector_->DisableTracing(); + } + + Report(); + + if (cpu_api_collector_ != nullptr) { + delete cpu_api_collector_; + } + if (gpu_api_collector_ != nullptr) { + delete gpu_api_collector_; + } + + if (cpu_kernel_collector_ != nullptr) { + delete cpu_kernel_collector_; + } + if (gpu_kernel_collector_ != nullptr) { + delete gpu_kernel_collector_; + } + + if (chrome_trace_.is_open()) { + CloseTraceFile(); + } + } + + bool CheckOption(unsigned option) { + return (options_ & (1 << option)); + } + + ClTracer(const ClTracer& copy) = delete; + ClTracer& operator=(const ClTracer& copy) = delete; + + private: + ClTracer(unsigned options) + : options_(options) { + start_time_ = std::chrono::steady_clock::now(); + + if (CheckOption(CLT_CHROME_DEVICE_TIMELINE) || + CheckOption(CLT_CHROME_CALL_LOGGING)) { + OpenTraceFile(); + } + } + + static uint64_t CalculateTotalTime(const ClApiCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + if (function_info_map.size() != 0) { + for (auto& value : 
function_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + static uint64_t CalculateTotalTime(const ClKernelCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + if (kernel_info_map.size() != 0) { + for (auto& value : kernel_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + static void PrintBackendTable( + const ClApiCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + PTI_ASSERT(function_info_map.size() > 0); + ClApiCollector::PrintFunctionsTable(function_info_map); + } + } + + static void PrintBackendTable( + const ClKernelCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + PTI_ASSERT(kernel_info_map.size() > 0); + ClKernelCollector::PrintKernelsTable(kernel_info_map); + } + } + + template + void ReportTiming( + const Collector* cpu_collector, + const Collector* gpu_collector, + const char* type) { + PTI_ASSERT (cpu_collector != nullptr || gpu_collector != nullptr); + + std::string cpu_title = + std::string("Total ") + std::string(type) + + " Time for CPU backend (ns): "; + std::string gpu_title = + std::string("Total ") + std::string(type) + + " Time for GPU 
backend (ns): "; + size_t title_width = std::max(cpu_title.size(), gpu_title.size()); + const size_t time_width = 20; + + std::cerr << std::endl; + std::cerr << "=== " << type << " Timing Results: ===" << std::endl; + std::cerr << std::endl; + std::cerr << std::setw(title_width) << "Total Execution Time (ns): " << + std::setw(time_width) << total_execution_time_ << std::endl; + + if (cpu_collector != nullptr) { + std::cerr << std::setw(title_width) << cpu_title << + std::setw(time_width) << CalculateTotalTime(cpu_collector) << + std::endl; + } + if (gpu_collector != nullptr) { + std::cerr << std::setw(title_width) << gpu_title << + std::setw(time_width) << CalculateTotalTime(gpu_collector) << + std::endl; + } + + if (cpu_collector != nullptr) { + PrintBackendTable(cpu_collector, "CPU"); + } + if (gpu_collector != nullptr) { + PrintBackendTable(gpu_collector, "GPU"); + } + + std::cerr << std::endl; + } + + void Report() { + if (CheckOption(CLT_HOST_TIMING)) { + ReportTiming(cpu_api_collector_, gpu_api_collector_, "API"); + } + if (CheckOption(CLT_DEVICE_TIMING)) { + ReportTiming(cpu_kernel_collector_, gpu_kernel_collector_, "Device"); + } + std::cerr << std::endl; + } + + static void DeviceTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + std::stringstream stream; + stream << "Device Timeline (queue: " << queue << + "): " << name << " [ns] = " << + queued << " (queued) " << + submitted << " (submit) " << + started << " (start) " << + ended << " (end)" << std::endl; + std::cerr << stream.str(); + } + + void OpenTraceFile() { + chrome_trace_.open(kChromeTraceFileName); + PTI_ASSERT(chrome_trace_.is_open()); + chrome_trace_ << "[" << std::endl; + chrome_trace_ << + "{\"ph\":\"M\", \"name\":\"process_name\", \"pid\":" << + utils::GetPid() << ", \"tid\":0, \"args\":{\"name\":\"" << + utils::GetExecutableName() << "\"}}," << std::endl; + } + + void CloseTraceFile() { + 
PTI_ASSERT(chrome_trace_.is_open()); + chrome_trace_.close(); + std::cerr << "Timeline was stored to " << + kChromeTraceFileName << std::endl; + } + + static void ChromeTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + ClTracer* tracer = reinterpret_cast(data); + PTI_ASSERT(tracer != nullptr); + + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << utils::GetPid() << + ", \"tid\":" << reinterpret_cast(queue) << + ", \"name\":\"" << name << + "\", \"ts\": " << started / NSEC_IN_USEC << + ", \"dur\":" << (ended - started) / NSEC_IN_USEC << + "}," << std::endl; + tracer->chrome_trace_ << stream.str(); + } + + static void DeviceAndChromeTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + DeviceTimelineCallback(data, queue, name, queued, submitted, started, ended); + ChromeTimelineCallback(data, queue, name, queued, submitted, started, ended); + } + + static void ChromeLoggingCallback( + void* data, const std::string& name, + uint64_t started, uint64_t ended) { + ClTracer* tracer = reinterpret_cast(data); + PTI_ASSERT(tracer != nullptr); + + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << + utils::GetPid() << ", \"tid\":" << utils::GetTid() << + ", \"name\":\"" << name << + "\", \"ts\": " << started / NSEC_IN_USEC << + ", \"dur\":" << (ended - started) / NSEC_IN_USEC << + "}," << std::endl; + tracer->chrome_trace_ << stream.str(); + } + + private: + unsigned options_; + + std::chrono::time_point start_time_; + uint64_t total_execution_time_ = 0; + + ClApiCollector* cpu_api_collector_ = nullptr; + ClApiCollector* gpu_api_collector_ = nullptr; + + ClKernelCollector* cpu_kernel_collector_ = nullptr; + ClKernelCollector* gpu_kernel_collector_ = nullptr; + + std::ofstream chrome_trace_; +}; + +#endif // PTI_SAMPLES_CL_TRACER_CL_TRACER_H_ \ No 
newline at end of file diff --git a/samples/cl_tracer/tool.cc b/samples/cl_tracer/tool.cc new file mode 100644 index 0000000..74ff36b --- /dev/null +++ b/samples/cl_tracer/tool.cc @@ -0,0 +1,130 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include + +#include "cl_tracer.h" +#include "cl_utils.h" + +static ClTracer* tracer = nullptr; + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void Usage() { + std::cout << + "Usage: ./cl_tracer[.exe] [options] " << + std::endl; + std::cout << "Options:" << std::endl; + std::cout << + "--call-logging [-c] Trace host API calls" << + std::endl; + std::cout << + "--host-timing [-h] Report host API execution time" << + std::endl; + std::cout << + "--device-timing [-d] Report kernels execution time" << + std::endl; + std::cout << + "--device-timeline [-t] Trace device activities" << + std::endl; + std::cout << + "--chrome-device-timeline Dump device activities to JSON file" << + std::endl; + std::cout << + "--chrome-call-logging Dump host API calls to JSON file" << + std::endl; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +int ParseArgs(int argc, char* argv[]) { + int app_index = 1; + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--call-logging") == 0 || + strcmp(argv[i], "-c") == 0) { + utils::SetEnv("CLT_CallLogging=1"); + ++app_index; + } else if (strcmp(argv[i], "--host-timing") == 0 || + strcmp(argv[i], "-h") == 0) { + utils::SetEnv("CLT_HostTiming=1"); + ++app_index; + } else if (strcmp(argv[i], "--device-timing") == 0 || + strcmp(argv[i], "-d") == 0) { + utils::SetEnv("CLT_DeviceTiming=1"); + ++app_index; + } else if (strcmp(argv[i], "--device-timeline") == 0 || + strcmp(argv[i], "-t") == 0) { + utils::SetEnv("CLT_DeviceTimeline=1"); + ++app_index; + } else if (strcmp(argv[i], "--chrome-device-timeline") == 0) { 
+ utils::SetEnv("CLT_ChromeDeviceTimeline=1"); + ++app_index; + } else if (strcmp(argv[i], "--chrome-call-logging") == 0) { + utils::SetEnv("CLT_ChromeCallLogging=1"); + ++app_index; + } else { + break; + } + } + return app_index; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void SetToolEnv() {} + +static unsigned ReadArgs() { + std::string value; + unsigned options = 0; + + value = utils::GetEnv("CLT_CallLogging"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_CALL_LOGGING); + } + + value = utils::GetEnv("CLT_HostTiming"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_HOST_TIMING); + } + + value = utils::GetEnv("CLT_DeviceTiming"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_DEVICE_TIMING); + } + + value = utils::GetEnv("CLT_DeviceTimeline"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_DEVICE_TIMELINE); + } + + value = utils::GetEnv("CLT_ChromeDeviceTimeline"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_CHROME_DEVICE_TIMELINE); + } + + value = utils::GetEnv("CLT_ChromeCallLogging"); + if (!value.empty() && value == "1") { + options |= (1 << CLT_CHROME_CALL_LOGGING); + } + + return options; +} + +void EnableProfiling() { + tracer = ClTracer::Create(ReadArgs()); +} + +void DisableProfiling() { + if (tracer != nullptr) { + delete tracer; + } +} \ No newline at end of file diff --git a/samples/dpc_gemm/CMakeLists.txt b/samples/dpc_gemm/CMakeLists.txt index dc7acaf..b5c1f83 100644 --- a/samples/dpc_gemm/CMakeLists.txt +++ b/samples/dpc_gemm/CMakeLists.txt @@ -2,12 +2,16 @@ include("../build_utils/CMakeLists.txt") SetRequiredCMakeVersion() cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION}) +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp.exe") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + project(PTI_Samples_DPC_GEMM CXX) SetCompilerFlags() SetBuildType() -CheckForDPCCompiler() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -gline-tables-only") 
add_executable(dpc_gemm main.cc) diff --git a/samples/dpc_gemm/README.md b/samples/dpc_gemm/README.md index 2594ae7..94dc8c1 100644 --- a/samples/dpc_gemm/README.md +++ b/samples/dpc_gemm/README.md @@ -26,13 +26,13 @@ Total execution time: 0.373728 sec ## Build and Run ### Linux -Run the following commands to build the sample (make sure you use Intel(R) oneAPI DPC++ Compiler for building): +Run the following commands to build the sample (make sure you have oneAPI DPC++ Compiler in `PATH` for building): ```sh source /setvars.sh cd /samples/dpc_gemm mkdir build cd build -CXX=dpcpp cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_BUILD_TYPE=Release .. make ``` Use this command line to run the application: @@ -40,7 +40,7 @@ Use this command line to run the application: ./dpc_gemm [cpu|gpu|host] [matrix_size] [repeat_count] ``` ### Windows (manual build) -Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample: +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample (make sure you have oneAPI DPC++ Compiler in `PATH` for building): ```sh \setvars.bat cd \samples\dpc_gemm diff --git a/samples/dpc_gemm/main.cc b/samples/dpc_gemm/main.cc index 97f1afc..ea2c857 100644 --- a/samples/dpc_gemm/main.cc +++ b/samples/dpc_gemm/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/dpc_info/CMakeLists.txt b/samples/dpc_info/CMakeLists.txt index b5a48d6..dea7650 100644 --- a/samples/dpc_info/CMakeLists.txt +++ b/samples/dpc_info/CMakeLists.txt @@ -2,12 +2,16 @@ include("../build_utils/CMakeLists.txt") SetRequiredCMakeVersion() cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION}) +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp.exe") +else() + set(CMAKE_CXX_COMPILER
"dpcpp") +endif() + project(PTI_Samples_DPC_Info CXX) SetCompilerFlags() SetBuildType() -CheckForDPCCompiler() - add_executable(dpc_info main.cc) if(CMAKE_INCLUDE_PATH) target_include_directories(dpc_info diff --git a/samples/dpc_info/README.md b/samples/dpc_info/README.md index 9513221..33be194 100644 --- a/samples/dpc_info/README.md +++ b/samples/dpc_info/README.md @@ -52,13 +52,13 @@ The following modes are implemented: ## Build and Run ### Linux -Run the following commands to build the sample: +Run the following commands to build the sample (make sure you have oneAPI DPC++ Compiler in `PATH` for building): ```sh source /setvars.sh cd /samples/dpc_info mkdir build cd build -CXX=dpcpp cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_BUILD_TYPE=Release .. make ``` Use this command line to run the utility: @@ -66,7 +66,7 @@ Use this command line to run the utility: ./dpc_info [-l|-a] ``` ### Windows (manual build) -Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample: +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample (make sure you have oneAPI DPC++ Compiler in `PATH` for building): ```sh \setvars.bat cd \samples\dpc_info diff --git a/samples/gpu_info/main.cc b/samples/gpu_info/main.cc index e2779cd..6e30855 100644 --- a/samples/gpu_info/main.cc +++ b/samples/gpu_info/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/gpu_inst_count/gpu_inst_count_collector.h b/samples/gpu_inst_count/gpu_inst_count_collector.h index 2697b15..eae2c7c 100644 --- a/samples/gpu_inst_count/gpu_inst_count_collector.h +++ b/samples/gpu_inst_count/gpu_inst_count_collector.h @@ -1,5 +1,5 @@ //============================================================== -// 
Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/gpu_inst_count/tool.cc b/samples/gpu_inst_count/tool.cc index 35fe46c..6921444 100644 --- a/samples/gpu_inst_count/tool.cc +++ b/samples/gpu_inst_count/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/gpu_perfmon_read/gpu_perfmon_collector.h b/samples/gpu_perfmon_read/gpu_perfmon_collector.h index f22d518..ad64f40 100644 --- a/samples/gpu_perfmon_read/gpu_perfmon_collector.h +++ b/samples/gpu_perfmon_read/gpu_perfmon_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/gpu_perfmon_read/tool.cc b/samples/gpu_perfmon_read/tool.cc index 5b62082..847bbcd 100644 --- a/samples/gpu_perfmon_read/tool.cc +++ b/samples/gpu_perfmon_read/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/gpu_perfmon_set/main.cc b/samples/gpu_perfmon_set/main.cc index 3033792..e3c9ffe 100644 --- a/samples/gpu_perfmon_set/main.cc +++ b/samples/gpu_perfmon_set/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // 
============================================================= diff --git a/samples/loader/init.cc b/samples/loader/init.cc index 414b3ec..577d385 100644 --- a/samples/loader/init.cc +++ b/samples/loader/init.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/loader/loader.cc b/samples/loader/loader.cc index 7755fe0..449d5ce 100644 --- a/samples/loader/loader.cc +++ b/samples/loader/loader.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/loader/loader.h b/samples/loader/loader.h index 1d9f5e9..e331fae 100644 --- a/samples/loader/loader.h +++ b/samples/loader/loader.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/loader/tool.h b/samples/loader/tool.h index b84fa07..dec6648 100644 --- a/samples/loader/tool.h +++ b/samples/loader/tool.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/omp_gemm/CMakeLists.txt b/samples/omp_gemm/CMakeLists.txt index 5bf23a1..8c8ad69 100644 --- a/samples/omp_gemm/CMakeLists.txt +++ b/samples/omp_gemm/CMakeLists.txt @@ -2,12 +2,16 @@ include("../build_utils/CMakeLists.txt") SetRequiredCMakeVersion() cmake_minimum_required(VERSION 
${REQUIRED_CMAKE_VERSION}) +if(WIN32) + set(CMAKE_CXX_COMPILER "icl.exe") +else() + set(CMAKE_CXX_COMPILER "icpx") +endif() + project(PTI_Samples_OpenMP_GEMM CXX) SetCompilerFlags() SetBuildType() -CheckForIntelCompiler() - if(WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qnextgen /Qopenmp") else() diff --git a/samples/omp_gemm/README.md b/samples/omp_gemm/README.md index deb7aee..ff807a7 100644 --- a/samples/omp_gemm/README.md +++ b/samples/omp_gemm/README.md @@ -26,13 +26,13 @@ Total execution time: 5.37699 sec ## Build and Run ### Linux -Run the following commands to build the sample (make sure you use Intel(R) C++ Compiler for building): +Run the following commands to build the sample (make sure you have Intel(R) C++ Compiler in `PATH` for building): ```sh source /setvars.sh cd /samples/omp_gemm mkdir build cd build -CXX=icpx cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_BUILD_TYPE=Release .. make ``` Use this command line to run the application: @@ -40,13 +40,12 @@ Use this command line to run the application: ./omp_gemm [cpu|gpu] [matrix_size] [repeat_count] ``` ### Windows -Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample: +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample (make sure you have Intel(R) C++ Compiler in `PATH` for building): ```sh \setvars.bat cd \samples\omp_gemm mkdir build cd build -set CXX=icl cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release .. 
nmake ``` diff --git a/samples/omp_gemm/main.cc b/samples/omp_gemm/main.cc index 3dd6a28..07b91b4 100644 --- a/samples/omp_gemm/main.cc +++ b/samples/omp_gemm/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/omp_hot_regions/CMakeLists.txt b/samples/omp_hot_regions/CMakeLists.txt index 11eb06d..b87f4c3 100644 --- a/samples/omp_hot_regions/CMakeLists.txt +++ b/samples/omp_hot_regions/CMakeLists.txt @@ -2,11 +2,16 @@ include("../build_utils/CMakeLists.txt") SetRequiredCMakeVersion() cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION}) +if(WIN32) + set(CMAKE_CXX_COMPILER "icl.exe") +else() + set(CMAKE_CXX_COMPILER "icpx") +endif() + project(PTI_Samples_OpenMP_Hot_Regions CXX) SetCompilerFlags() SetBuildType() -CheckForIntelCompiler() CheckForOMPTHeaders() add_library(omp_hot_regions SHARED tool.cc) diff --git a/samples/omp_hot_regions/README.md b/samples/omp_hot_regions/README.md index df0816d..51cd2d0 100644 --- a/samples/omp_hot_regions/README.md +++ b/samples/omp_hot_regions/README.md @@ -26,13 +26,13 @@ Total Region Time (ns): 186811670 ## Build and Run ### Linux -Run the following commands to build the sample (make sure you use Intel(R) C++ Compiler for building): +Run the following commands to build the sample (make sure you have Intel(R) C++ Compiler in `PATH` for building): ```sh source /setvars.sh cd /samples/omp_hot_regions mkdir build cd build -CXX=icpx cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_BUILD_TYPE=Release .. 
make ``` Use this command line to run the tool: @@ -44,13 +44,12 @@ One may use [omp_gemm](../omp_gemm) as target application: OMP_TOOL_LIBRARIES=./libomp_hot_regions.so ../../omp_gemm/build/omp_gemm ``` ### Windows -Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample (make sure you use Intel(R) C++ Compiler for building): +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample (make sure you have Intel(R) C++ Compiler in `PATH` for building): ```sh \setvars.sh cd \samples\omp_hot_regions mkdir build cd build -set CXX=icl cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release .. nmake ``` diff --git a/samples/omp_hot_regions/omp_region_collector.h b/samples/omp_hot_regions/omp_region_collector.h index 6265550..6e639b5 100644 --- a/samples/omp_hot_regions/omp_region_collector.h +++ b/samples/omp_hot_regions/omp_region_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/omp_hot_regions/tool.cc b/samples/omp_hot_regions/tool.cc index 763b399..afa1719 100644 --- a/samples/omp_hot_regions/tool.cc +++ b/samples/omp_hot_regions/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/cl_tracer.h b/samples/utils/cl_api_tracer.h similarity index 92% rename from samples/utils/cl_tracer.h rename to samples/utils/cl_api_tracer.h index 6dd4aea..e593904 100644 --- a/samples/utils/cl_tracer.h +++ b/samples/utils/cl_api_tracer.h @@ -1,20 +1,20 @@ //============================================================== -// Copyright © 
2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= -#ifndef PTI_SAMPLES_UTILS_CL_TRACER_H_ -#define PTI_SAMPLES_UTILS_CL_TRACER_H_ +#ifndef PTI_SAMPLES_UTILS_CL_API_TRACER_H_ +#define PTI_SAMPLES_UTILS_CL_API_TRACER_H_ #include #include "pti_assert.h" -class ClTracer { +class ClApiTracer { public: - ClTracer(cl_device_id device, cl_tracing_callback callback, - void* user_data) { + ClApiTracer( + cl_device_id device, cl_tracing_callback callback, void* user_data) { PTI_ASSERT(device != nullptr); bool loaded = LoadTracingFunctions(device); @@ -29,7 +29,7 @@ class ClTracer { } } - ~ClTracer() { + ~ClApiTracer() { if (handle_ != nullptr) { cl_int status = CL_SUCCESS; status = clDestroyTracingHandle_(handle_); @@ -141,4 +141,4 @@ class ClTracer { decltype(clGetTracingStateINTEL)* clGetTracingState_ = nullptr; }; -#endif // PTI_SAMPLES_UTILS_CL_TRACER_H_ \ No newline at end of file +#endif // PTI_SAMPLES_UTILS_CL_API_TRACER_H_ \ No newline at end of file diff --git a/samples/utils/cl_utils.h b/samples/utils/cl_utils.h index 2242868..3263e48 100644 --- a/samples/utils/cl_utils.h +++ b/samples/utils/cl_utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -54,6 +54,10 @@ inline cl_device_id GetIntelDevice(cl_device_type type) { break; } } + + if (target != nullptr) { + break; + } } return target; @@ -76,13 +80,25 @@ inline std::string GetDeviceName(cl_device_id device) { PTI_ASSERT(device != nullptr); char name[MAX_STR_SIZE] = { 0 }; - cl_int status = clGetDeviceInfo(device, CL_DEVICE_NAME, - MAX_STR_SIZE, name, nullptr); + cl_int status = clGetDeviceInfo( + device, CL_DEVICE_NAME, MAX_STR_SIZE, name, nullptr); PTI_ASSERT(status == CL_SUCCESS); 
return name; } +inline cl_device_type GetDeviceType(cl_device_id device) { + PTI_ASSERT(device != nullptr); + + cl_device_type type = CL_DEVICE_TYPE_ALL; + cl_int status = clGetDeviceInfo( + device, CL_DEVICE_TYPE, sizeof(type), &type, nullptr); + PTI_ASSERT(status == CL_SUCCESS); + PTI_ASSERT(type != CL_DEVICE_TYPE_ALL); + + return type; +} + inline cl_program GetProgram(cl_kernel kernel) { PTI_ASSERT(kernel != nullptr); @@ -213,28 +229,44 @@ inline cl_device_id GetDevice(cl_command_queue queue) { return device; } -inline cl_ulong GetEventStartTime(cl_event event) { +inline cl_ulong GetEventTimestamp(cl_event event, cl_profiling_info info) { PTI_ASSERT(event != nullptr); cl_int status = CL_SUCCESS; cl_ulong start = 0; - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, - sizeof(cl_ulong), &start, nullptr); + status = clGetEventProfilingInfo( + event, info, sizeof(cl_ulong), &start, nullptr); PTI_ASSERT(status == CL_SUCCESS); return start; } -inline cl_ulong GetEventEndTime(cl_event event) { - PTI_ASSERT(event != nullptr); - - cl_int status = CL_SUCCESS; - cl_ulong end = 0; +inline cl_ulong GetGpuTimestamp() { + cl_ulong timestamp = 0; +#if defined(_WIN32) + BOOL success = QueryPerformanceCounter(&timestamp); + PTI_ASSERT(success); +#else + timespec tp{0, 0}; + int status = clock_gettime(CLOCK_MONOTONIC_RAW, &tp); + PTI_ASSERT(status == 0); + timestamp = NSEC_IN_SEC * tp.tv_sec + tp.tv_nsec; +#endif + return timestamp; +} - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &end, nullptr); - PTI_ASSERT(status == CL_SUCCESS); - return end; +inline cl_ulong GetCpuTimestamp() { + cl_ulong timestamp = 0; +#if defined(_WIN32) + BOOL success = QueryPerformanceCounter(&timestamp); + PTI_ASSERT(success); +#else + timespec tp{0, 0}; + int status = clock_gettime(CLOCK_MONOTONIC, &tp); + PTI_ASSERT(status == 0); + timestamp = NSEC_IN_SEC * tp.tv_sec + tp.tv_nsec; +#endif + return timestamp; } } // namespace cl diff --git 
a/samples/utils/debug_abbrev_parser.h b/samples/utils/debug_abbrev_parser.h index 00a85ce..a69597a 100644 --- a/samples/utils/debug_abbrev_parser.h +++ b/samples/utils/debug_abbrev_parser.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/debug_info_parser.h b/samples/utils/debug_info_parser.h index e281514..4b494fa 100644 --- a/samples/utils/debug_info_parser.h +++ b/samples/utils/debug_info_parser.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/debug_line_parser.h b/samples/utils/debug_line_parser.h index 6cf5d1e..acb85f6 100644 --- a/samples/utils/debug_line_parser.h +++ b/samples/utils/debug_line_parser.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/dwarf.h b/samples/utils/dwarf.h index 2ada222..0813446 100644 --- a/samples/utils/dwarf.h +++ b/samples/utils/dwarf.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/dwarf_state_machine.h b/samples/utils/dwarf_state_machine.h index 7a4444f..9eabc73 100644 --- a/samples/utils/dwarf_state_machine.h +++ b/samples/utils/dwarf_state_machine.h @@ -1,5 +1,5 @@ 
//============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/elf.h b/samples/utils/elf.h index d05380a..a11bc75 100644 --- a/samples/utils/elf.h +++ b/samples/utils/elf.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/elf_parser.h b/samples/utils/elf_parser.h index 64e5889..b05a610 100644 --- a/samples/utils/elf_parser.h +++ b/samples/utils/elf_parser.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/gen_binary_decoder.h b/samples/utils/gen_binary_decoder.h index ebb00ac..7dc83b0 100644 --- a/samples/utils/gen_binary_decoder.h +++ b/samples/utils/gen_binary_decoder.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/gen_symbols_decoder.h b/samples/utils/gen_symbols_decoder.h index f5fc6df..df7c7fc 100644 --- a/samples/utils/gen_symbols_decoder.h +++ b/samples/utils/gen_symbols_decoder.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git 
a/samples/utils/gtpin_utils.h b/samples/utils/gtpin_utils.h index 2e188c8..1014e86 100644 --- a/samples/utils/gtpin_utils.h +++ b/samples/utils/gtpin_utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/i915_utils.h b/samples/utils/i915_utils.h index b9d2b77..84f7754 100644 --- a/samples/utils/i915_utils.h +++ b/samples/utils/i915_utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/igc_binary_decoder.h b/samples/utils/igc_binary_decoder.h index 5111cc4..a0d0594 100644 --- a/samples/utils/igc_binary_decoder.h +++ b/samples/utils/igc_binary_decoder.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/leb128.h b/samples/utils/leb128.h index d01791c..9ca8131 100644 --- a/samples/utils/leb128.h +++ b/samples/utils/leb128.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/metric_device.h b/samples/utils/metric_device.h index fc9719c..9289b9c 100644 --- a/samples/utils/metric_device.h +++ b/samples/utils/metric_device.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation 
+// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/metric_utils.h b/samples/utils/metric_utils.h index 7d0ce21..ade7784 100644 --- a/samples/utils/metric_utils.h +++ b/samples/utils/metric_utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/pti_assert.h b/samples/utils/pti_assert.h index 210c558..76909d2 100644 --- a/samples/utils/pti_assert.h +++ b/samples/utils/pti_assert.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/shared_library.h b/samples/utils/shared_library.h index b67d8f8..83683f1 100644 --- a/samples/utils/shared_library.h +++ b/samples/utils/shared_library.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/trace_guard.cc b/samples/utils/trace_guard.cc new file mode 100644 index 0000000..6ad620d --- /dev/null +++ b/samples/utils/trace_guard.cc @@ -0,0 +1,9 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include "trace_guard.h" + +thread_local int TraceGuard::inactive_count_ = 0; \ No newline at end of file diff --git a/samples/utils/trace_guard.h b/samples/utils/trace_guard.h 
new file mode 100644 index 0000000..25cefa0 --- /dev/null +++ b/samples/utils/trace_guard.h @@ -0,0 +1,31 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef PTI_SAMPLES_UTILS_TRACE_GUARD_H_ +#define PTI_SAMPLES_UTILS_TRACE_GUARD_H_ + +#include "pti_assert.h" + +class TraceGuard { + public: + TraceGuard() { + ++inactive_count_; + } + + ~TraceGuard() { + PTI_ASSERT(inactive_count_ > 0); + --inactive_count_; + } + + static bool Inactive() { + return inactive_count_ > 0; + } + + private: + static thread_local int inactive_count_; +}; + +#endif // PTI_SAMPLES_UTILS_TRACE_GUARD_H_ \ No newline at end of file diff --git a/samples/utils/utils.h b/samples/utils/utils.h index 27c4c3f..4a6397c 100644 --- a/samples/utils/utils.h +++ b/samples/utils/utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/utils/ze_utils.h b/samples/utils/ze_utils.h index a108d3e..efca8df 100644 --- a/samples/utils/ze_utils.h +++ b/samples/utils/ze_utils.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2019-2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_debug_info/tool.cc b/samples/ze_debug_info/tool.cc index 0fbb65d..ff30553 100644 --- a/samples/ze_debug_info/tool.cc +++ b/samples/ze_debug_info/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // 
============================================================= diff --git a/samples/ze_debug_info/ze_debug_info_collector.h b/samples/ze_debug_info/ze_debug_info_collector.h index baf0879..0d712cc 100644 --- a/samples/ze_debug_info/ze_debug_info_collector.h +++ b/samples/ze_debug_info/ze_debug_info_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_gemm/main.cc b/samples/ze_gemm/main.cc index 07b08d9..684171c 100644 --- a/samples/ze_gemm/main.cc +++ b/samples/ze_gemm/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_hot_functions/gen_tracing_callbacks.py b/samples/ze_hot_functions/gen_tracing_callbacks.py index 7878464..baa2bd1 100644 --- a/samples/ze_hot_functions/gen_tracing_callbacks.py +++ b/samples/ze_hot_functions/gen_tracing_callbacks.py @@ -1,3 +1,9 @@ +#============================================================== +# Copyright (C) Intel Corporation +# +# SPDX-License-Identifier: MIT +# ============================================================= + import os import sys import re @@ -259,17 +265,39 @@ def gen_api(f, func_list, group_map): f.write("}\n") f.write("\n") +def gen_structure_type_converter(f, enum_map): + struct_type_enum = {} + for name in enum_map["ze_structure_type_t"]: + struct_type_enum[name] = int(enum_map["ze_structure_type_t"][name]) + struct_type_enum = sorted(struct_type_enum.items(), key=lambda x:x[1]) + assert "ze_structure_type_t" in enum_map + f.write("static const char* GetStructureTypeString(unsigned structure_type) {\n") + f.write(" switch (structure_type) 
{\n") + for name, value in struct_type_enum: + f.write(" case " + name + ":\n") + f.write(" return \"" + name + "\";\n") + f.write(" default:\n") + f.write(" break;\n") + f.write(" }\n") + f.write(" return \"UNKNOWN\";\n") + f.write("}\n") + f.write("\n") + def gen_result_converter(f, enum_map): + result_enum = {} + for name in enum_map["ze_result_t"]: + result_enum[name] = int(enum_map["ze_result_t"][name]) + result_enum = sorted(result_enum.items(), key=lambda x:x[1]) assert "ze_result_t" in enum_map f.write("static const char* GetResultString(unsigned result) {\n") f.write(" switch (result) {\n") - for name, value in enum_map["ze_result_t"].items(): - f.write(" case " + value + ":\n") + for name, value in result_enum: + f.write(" case " + name + ":\n") f.write(" return \"" + name + "\";\n") f.write(" default:\n") f.write(" break;\n") f.write(" }\n") - f.write(" return \"\";\n") + f.write(" return \"UNKNOWN\";\n") f.write("}\n") f.write("\n") @@ -278,61 +306,195 @@ def gen_enum(f, enum_map, enum_name, param_name): f.write(" switch (" + param_name + ") {\n") for name, value in enum_map[enum_name].items(): f.write(" case " + value + ":\n") - f.write(" std::cerr << \"" + name + "\";\n") + f.write(" stream << \"" + name + "\";\n") f.write(" break;\n") f.write(" default:\n") - f.write(" std::cerr << \"\";\n") + f.write(" stream << \"\";\n") f.write(" break;\n") f.write(" }\n") def gen_enter_callback(f, func, params, enum_map): f.write(" PTI_ASSERT(global_user_data != nullptr);\n") - f.write(" ZeApiCollector* collector = reinterpret_cast<ZeApiCollector*>(global_user_data);\n") + f.write(" ZeApiCollector* collector =\n") + f.write(" reinterpret_cast<ZeApiCollector*>(global_user_data);\n") f.write("\n") f.write(" uint64_t& start_time = *reinterpret_cast<uint64_t*>(instance_user_data);\n") f.write(" start_time = collector->GetTimestamp();\n") f.write(" if (collector->call_tracing_) {\n") - f.write(" std::cerr << \">>>> [\" << start_time << \"] \";\n") - f.write(" std::cerr << \"" + func + "\" << \":\";\n") + 
f.write(" std::stringstream stream;\n") + f.write(" stream << \">>>> [\" << start_time << \"] \";\n") + f.write(" stream << \"" + func + "\" << \":\";\n") for name, type in params: if type == "ze_ipc_mem_handle_t" or type == "ze_ipc_event_pool_handle_t": - f.write(" std::cerr << \" " + name + " = \" << (params->p" + name + ")->data;\n") + f.write(" stream << \" " + name + " = \" << (params->p" + name + ")->data;\n") else: - f.write(" std::cerr << \" " + name + " = \" << *(params->p" + name + ");\n") - if type in enum_map: - f.write(" std::cerr << \" (\";\n") - gen_enum(f, enum_map, type, "*(params->p" + name + ")") - f.write(" std::cerr << \")\";\n") - elif name.find("ph") == 0 or name.find("pptr") == 0 or name.find("pCount") == 0: - f.write(" if (*(params->p" + name + ") != nullptr) {\n") - if type == "ze_ipc_mem_handle_t*" or type == "ze_ipc_event_pool_handle_t*": - f.write(" std::cerr << \" (" + name[1:] + " = \" << (*(params->p" + name + "))->data << \")\";\n") - else: - f.write(" std::cerr << \" (" + name[1:] + " = \" << **(params->p" + name + ") << \")\";\n") + if type.find("char*") >= 0 and type.find("char*") == len(type) - len("char*"): + f.write(" if (*(params->p" + name + ") == nullptr) {\n") + f.write(" stream << \" " + name + " = \" << \"0\";\n") + f.write(" } else if (strlen(*(params->p" + name +")) == 0) {\n") + f.write(" stream << \" " + name + " = \\\"\\\"\";\n") + f.write(" } else {\n") + f.write(" stream << \" " + name + " = \\\"\" << *(params->p" + name + ") << \"\\\"\";\n") f.write(" }\n") - elif type.find("ze_command_queue_desc_t*") >= 0: - f.write(" if (*(params->p" + name + ") != nullptr) {\n") - f.write(" std::cerr << \" {\" << (*(params->p" + name + "))->stype << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->pNext << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->ordinal << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->index << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + 
"))->flags << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->mode << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->priority << \"}\";\n") - f.write(" }\n") - elif type.find("ze_kernel_desc_t*") >= 0: - f.write(" if (*(params->p" + name + ") != nullptr) {\n") - f.write(" std::cerr << \" {\" << (*(params->p" + name + "))->stype << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->pNext << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->flags << \" \";\n") - f.write(" std::cerr << (*(params->p" + name + "))->pKernelName << \"}\";\n") - f.write(" }\n") - f.write(" std::cerr << std::endl;\n") + else: + f.write(" stream << \" " + name + " = \" << *(params->p" + name + ");\n") + if name.find("ph") == 0 or name.find("pptr") == 0 or name.find("pCount") == 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + if type == "ze_ipc_mem_handle_t*" or type == "ze_ipc_event_pool_handle_t*": + f.write(" stream << \" (" + name[1:] + " = \" << (*(params->p" + name + "))->data << \")\";\n") + else: + f.write(" stream << \" (" + name[1:] + " = \" << **(params->p" + name + ") << \")\";\n") + f.write(" }\n") + elif type.find("ze_event_pool_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->count << \"}\";\n") + f.write(" }\n") + elif type.find("ze_command_queue_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") 
\";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->ordinal << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->index << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->mode << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->priority << \"}\";\n") + f.write(" }\n") + elif type.find("ze_kernel_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" if ((*(params->p" + name + "))->pKernelName == nullptr) {\n") + f.write(" stream << \"0\";\n") + f.write(" } else if (strlen((*(params->p" + name + "))->pKernelName) == 0) {\n") + f.write(" stream << \" " + name + " = \\\"\\\"\";\n") + f.write(" } else {\n") + f.write(" stream << (*(params->p" + name + "))->pKernelName << \"}\";\n") + f.write(" }\n") + f.write(" }\n") + elif type.find("ze_device_mem_alloc_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->ordinal << \"}\";\n") + f.write(" }\n") + elif type.find("ze_context_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + 
"))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + elif type.find("ze_command_list_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->commandQueueGroupOrdinal << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + elif type.find("ze_event_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->index << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->signal << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->wait << \"}\";\n") + f.write(" }\n") + elif type.find("ze_fence_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + elif type.find("ze_image_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" 
<< GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->type << \" \";\n") + f.write(" stream << \"{\" << (*(params->p" + name +"))->format.layout << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format.type << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format.x << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format.y << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format.z << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format.w << \"}\" << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->width << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->height << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->depth << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->arraylevels << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->miplevels << \"}\";\n") + f.write(" }\n") + elif type.find("ze_host_mem_alloc_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + elif type.find("ze_external_memory_export_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + 
f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + elif type.find("ze_module_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->format << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->inputSize << \" \";\n") + f.write(" stream << static_cast((*(params->p" + name + "))->pInputModule) << \" \";\n") + f.write(" if ((*(params->p" + name + ")) -> pBuildFlags != nullptr) \n") + f.write(" stream << (*(params->p" + name + "))->pBuildFlags << \" \";\n") + f.write(" else stream << 0 << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->pConstants << \"}\";\n") + f.write(" }\n") + elif type.find("ze_sampler_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->addressMode << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->filterMode << \" \";\n") + f.write(" stream << static_cast((*(params->p" + name + "))->isNormalized) << \"}\";\n") + f.write(" }\n") + elif type.find("ze_physical_mem_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << 
(*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->size << \"}\";\n") + f.write(" }\n") + elif type.find("ze_raytracing_mem_alloc_ext_desc_t*") >= 0: + f.write(" if (*(params->p" + name + ") != nullptr) {\n") + f.write(" stream << \" {\" << GetStructureTypeString((*(params->p" + name + "))->stype)\n") + f.write(" << \"(0x\" << std::hex << (*(params->p" + name + "))->stype << std::dec << \") \";\n") + f.write(" stream << (*(params->p" + name + "))->pNext << \" \";\n") + f.write(" stream << (*(params->p" + name + "))->flags << \"}\";\n") + f.write(" }\n") + f.write(" stream << std::endl;\n") + f.write(" std::cerr << stream.str();\n") f.write(" }\n") def gen_exit_callback(f, func, params, enum_map): f.write(" PTI_ASSERT(global_user_data != nullptr);\n") - f.write(" ZeApiCollector* collector = reinterpret_cast(global_user_data);\n") + f.write(" ZeApiCollector* collector =\n") + f.write(" reinterpret_cast(global_user_data);\n") f.write(" uint64_t end_time = collector->GetTimestamp();\n") f.write("\n") f.write(" uint64_t& start_time = *reinterpret_cast(instance_user_data);\n") @@ -341,24 +503,26 @@ def gen_exit_callback(f, func, params, enum_map): f.write(" uint64_t time = end_time - start_time;\n") f.write(" collector->AddFunctionTime(\"" + func + "\", time);\n") f.write(" if (collector->call_tracing_) {\n") - f.write(" std::cerr << \"<<<< [\" << end_time << \"] \";\n") - f.write(" std::cerr << \"" + func + "\" << \" [\" << time << \" ns]\";\n") + f.write(" std::stringstream stream;\n") + f.write(" stream << \"<<<< [\" << end_time << \"] \";\n") + f.write(" stream << \"" + func + "\" << \" [\" << time << \" ns]\";\n") for name, type in params: if name.find("ph") == 0 or name.find("pptr") == 0 or name.find("pCount") == 0: f.write(" if (*(params->p" + name + ") != nullptr) {\n") if type == "ze_ipc_mem_handle_t*" or type == "ze_ipc_event_pool_handle_t*": - 
f.write(" std::cerr << \" " + name[1:] + " = \" << (*(params->p" + name + "))->data << \"\";\n") + f.write(" stream << \" " + name[1:] + " = \" << (*(params->p" + name + "))->data << \"\";\n") else: - f.write(" std::cerr << \" " + name[1:] + " = \" << **(params->p" + name + ") << \"\";\n") + f.write(" stream << \" " + name[1:] + " = \" << **(params->p" + name + ") << \"\";\n") f.write(" }\n") - f.write(" std::cerr << \" -> \" << GetResultString(result) << \n") - f.write(" \" (\" << result << \")\" << std::endl;\n") + f.write(" stream << \" -> \" << GetResultString(result) << \n") + f.write(" \"(0x\" << result << \")\" << std::endl;\n") + f.write(" std::cerr << stream.str();\n") f.write(" }\n") f.write("\n") f.write(" if (collector->callback_ != nullptr) {\n") - f.write(" collector->callback_(collector->callback_data_,\n") - f.write(" \"" + func + "\",\n") - f.write(" start_time, end_time);\n") + f.write(" collector->callback_(\n") + f.write(" collector->callback_data_, \"" + func + "\",\n") + f.write(" start_time, end_time);\n") f.write(" }\n") def gen_callbacks(f, func_list, group_map, param_map, enum_map): @@ -414,6 +578,7 @@ def main(): enum_map = get_enum_map(l0_path) gen_result_converter(dst_file, enum_map) + gen_structure_type_converter(dst_file, enum_map) gen_callbacks(dst_file, func_list, group_map, param_map, enum_map) gen_api(dst_file, func_list, group_map) diff --git a/samples/ze_hot_functions/tool.cc b/samples/ze_hot_functions/tool.cc index 18c366b..f795205 100644 --- a/samples/ze_hot_functions/tool.cc +++ b/samples/ze_hot_functions/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -44,7 +44,7 @@ static void PrintResults() { std::chrono::duration time = end - start; PTI_ASSERT(collector != nullptr); - const FunctionInfoMap& 
function_info_map = collector->GetFunctionInfoMap(); + const ZeFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); if (function_info_map.size() == 0) { return; } diff --git a/samples/ze_hot_functions/ze_api_collector.h b/samples/ze_hot_functions/ze_api_collector.h index 8e7fbaf..551f23c 100644 --- a/samples/ze_hot_functions/ze_api_collector.h +++ b/samples/ze_hot_functions/ze_api_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -19,20 +19,20 @@ #include "utils.h" #include "ze_utils.h" -struct Function { +struct ZeFunction { uint64_t total_time; uint64_t min_time; uint64_t max_time; uint64_t call_count; - bool operator>(const Function& r) const { + bool operator>(const ZeFunction& r) const { if (total_time != r.total_time) { return total_time > r.total_time; } return call_count > r.call_count; } - bool operator!=(const Function& r) const { + bool operator!=(const ZeFunction& r) const { if (total_time == r.total_time) { return call_count != r.call_count; } @@ -40,19 +40,19 @@ struct Function { } }; -using FunctionInfoMap = std::map; -using FunctionTimePoint = std::chrono::time_point; +using ZeFunctionInfoMap = std::map; +using ZeFunctionTimePoint = std::chrono::time_point; -typedef void (*OnFunctionFinishCallback)( +typedef void (*OnZeFunctionFinishCallback)( void* data, const std::string& name, uint64_t started, uint64_t ended); class ZeApiCollector { public: // User Interface static ZeApiCollector* Create( - FunctionTimePoint base_time = std::chrono::steady_clock::now(), + ZeFunctionTimePoint base_time = std::chrono::steady_clock::now(), bool call_tracing = false, - OnFunctionFinishCallback callback = nullptr, + OnZeFunctionFinishCallback callback = nullptr, void* callback_data = nullptr) { ZeApiCollector* collector = new 
ZeApiCollector(base_time, call_tracing, @@ -66,7 +66,7 @@ class ZeApiCollector { status = zelTracerCreate(&tracer_desc, &tracer); if (status != ZE_RESULT_SUCCESS || tracer == nullptr) { - std::cerr << "[WARNING] Unable to create Level Zero tracer" << std::endl; + std::cerr << "[WARNING] Unable to create L0 tracer" << std::endl; delete collector; return nullptr; } @@ -87,12 +87,12 @@ class ZeApiCollector { PTI_ASSERT(status == ZE_RESULT_SUCCESS); } - const FunctionInfoMap& GetFunctionInfoMap() const { + const ZeFunctionInfoMap& GetFunctionInfoMap() const { return function_info_map_; } - static void PrintFunctionsTable(const FunctionInfoMap& function_info_map) { - std::set< std::pair, + static void PrintFunctionsTable(const ZeFunctionInfoMap& function_info_map) { + std::set< std::pair, utils::Comparator > sorted_list( function_info_map.begin(), function_info_map.end()); @@ -155,7 +155,7 @@ class ZeApiCollector { if (function_info_map_.count(name) == 0) { function_info_map_[name] = {time, time, time, 1}; } else { - Function& function = function_info_map_[name]; + ZeFunction& function = function_info_map_[name]; function.total_time += time; if (time < function.min_time) { function.min_time = time; @@ -168,27 +168,24 @@ class ZeApiCollector { } private: // Implementation Details - ZeApiCollector(FunctionTimePoint base_time, - bool call_tracing, - OnFunctionFinishCallback callback, - void* callback_data) - : base_time_(base_time), - call_tracing_(call_tracing), - callback_(callback), - callback_data_(callback_data) {} + ZeApiCollector( + ZeFunctionTimePoint base_time, bool call_tracing, + OnZeFunctionFinishCallback callback, void* callback_data) + : base_time_(base_time), call_tracing_(call_tracing), + callback_(callback), callback_data_(callback_data) {} #include // Auto-generated callbacks private: // Data zel_tracer_handle_t tracer_ = nullptr; - FunctionInfoMap function_info_map_; + ZeFunctionInfoMap function_info_map_; std::mutex lock_; - FunctionTimePoint 
base_time_; + ZeFunctionTimePoint base_time_; bool call_tracing_ = false; - OnFunctionFinishCallback callback_ = nullptr; + OnZeFunctionFinishCallback callback_ = nullptr; void* callback_data_ = nullptr; static const uint32_t kFunctionLength = 10; diff --git a/samples/ze_hot_kernels/tool.cc b/samples/ze_hot_kernels/tool.cc index 41ca09a..c96d68a 100644 --- a/samples/ze_hot_kernels/tool.cc +++ b/samples/ze_hot_kernels/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -49,7 +49,7 @@ static void PrintResults() { std::chrono::duration time = end - start; PTI_ASSERT(collector != nullptr); - const KernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + const ZeKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); if (kernel_info_map.size() == 0) { return; } diff --git a/samples/ze_hot_kernels/ze_kernel_collector.h b/samples/ze_hot_kernels/ze_kernel_collector.h index a14ea9c..93eb369 100644 --- a/samples/ze_hot_kernels/ze_kernel_collector.h +++ b/samples/ze_hot_kernels/ze_kernel_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -19,17 +19,18 @@ #include "utils.h" #include "ze_utils.h" -struct KernelInstance { +struct ZeKernelInstance { std::string name; size_t simd_width; size_t bytes_transferred; + void* queue; ze_event_pool_handle_t event_pool; ze_event_handle_t event; uint64_t append_time; uint64_t submit_time; }; -struct KernelInfo { +struct ZeKernelInfo { uint64_t total_time; uint64_t min_time; uint64_t max_time; @@ -37,14 +38,14 @@ struct KernelInfo { size_t simd_width; size_t bytes_transferred; - bool 
operator>(const KernelInfo& r) const { + bool operator>(const ZeKernelInfo& r) const { if (total_time != r.total_time) { return total_time > r.total_time; } return call_count > r.call_count; } - bool operator!=(const KernelInfo& r) const { + bool operator!=(const ZeKernelInfo& r) const { if (total_time == r.total_time) { return call_count != r.call_count; } @@ -52,26 +53,26 @@ struct KernelInfo { } }; -struct KernelInterval { +struct ZeKernelInterval { std::string name; uint64_t start; uint64_t end; }; -struct CommandListInfo { +struct ZeCommandListInfo { ze_context_handle_t context; - std::vector kernel_list; + std::vector kernel_list; bool immediate; }; -using KernelInfoMap = std::map; -using KernelIntervalList = std::vector; -using KernelNameMap = std::map; -using KernelTimePoint = std::chrono::time_point; -using CommandListMap = std::map; +using ZeKernelInfoMap = std::map; +using ZeKernelIntervalList = std::vector; +using ZeKernelNameMap = std::map; +using ZeKernelTimePoint = std::chrono::time_point; +using ZeCommandListMap = std::map; -typedef void (*OnKernelFinishCallback)( - void* data, const std::string& name, +typedef void (*OnZeKernelFinishCallback)( + void* data, void* queue, const std::string& name, uint64_t appended, uint64_t submitted, uint64_t started, uint64_t ended); @@ -79,8 +80,8 @@ class ZeKernelCollector { public: // Interface static ZeKernelCollector* Create( - KernelTimePoint base_time = std::chrono::steady_clock::now(), - OnKernelFinishCallback callback = nullptr, + ZeKernelTimePoint base_time = std::chrono::steady_clock::now(), + OnZeKernelFinishCallback callback = nullptr, void* callback_data = nullptr) { ZeKernelCollector* collector = new ZeKernelCollector( base_time, callback, callback_data); @@ -101,8 +102,8 @@ class ZeKernelCollector { return collector; } - static void PrintKernelsTable(const KernelInfoMap& kernel_info_map) { - std::set< std::pair, + static void PrintKernelsTable(const ZeKernelInfoMap& kernel_info_map) { + std::set< 
std::pair, utils::Comparator > sorted_list( kernel_info_map.begin(), kernel_info_map.end()); @@ -168,20 +169,22 @@ class ZeKernelCollector { PTI_ASSERT(status == ZE_RESULT_SUCCESS); } - const KernelInfoMap& GetKernelInfoMap() const { + const ZeKernelInfoMap& GetKernelInfoMap() const { return kernel_info_map_; } - const KernelIntervalList& GetKernelIntervalList() const { + const ZeKernelIntervalList& GetKernelIntervalList() const { return kernel_interval_list_; } private: // Implementation - ZeKernelCollector(KernelTimePoint base_time, - OnKernelFinishCallback callback, - void* callback_data) - : base_time_(base_time), callback_(callback), + ZeKernelCollector( + ZeKernelTimePoint base_time, + OnZeKernelFinishCallback callback, + void* callback_data) + : base_time_(base_time), + callback_(callback), callback_data_(callback_data), timer_frequency_(utils::i915::GetGpuTimerFrequency()) { PTI_ASSERT(timer_frequency_ > 0); @@ -226,7 +229,7 @@ class ZeKernelCollector { epilogue_callbacks.CommandList.pfnCreateCb = OnExitCommandListCreate; epilogue_callbacks.CommandList.pfnCreateImmediateCb = - OnExitCommandListImmediateCreate; + OnExitCommandListCreateImmediate; epilogue_callbacks.CommandList.pfnDestroyCb = OnExitCommandListDestroy; epilogue_callbacks.CommandList.pfnResetCb = @@ -279,16 +282,17 @@ class ZeKernelCollector { } void AddKernelInstance(ze_command_list_handle_t command_list, - const KernelInstance& instance) { + const ZeKernelInstance& instance) { PTI_ASSERT(command_list != nullptr); const std::lock_guard lock(lock_); kernel_instance_list_.push_back(instance); - KernelInstance* kernel_instance = &kernel_instance_list_.back(); + ZeKernelInstance* kernel_instance = &kernel_instance_list_.back(); PTI_ASSERT(command_list_map_.count(command_list) == 1); - CommandListInfo& command_list_info = command_list_map_[command_list]; + ZeCommandListInfo& command_list_info = command_list_map_[command_list]; if (command_list_info.immediate) { kernel_instance->submit_time = 
kernel_instance->append_time; + kernel_instance->queue = command_list; } command_list_info.kernel_list.push_back(kernel_instance); } @@ -307,7 +311,7 @@ class ZeKernelCollector { } } - void ProcessInstance(const KernelInstance& instance) { + void ProcessInstance(const ZeKernelInstance& instance) { ze_result_t status = ZE_RESULT_SUCCESS; status = zeEventQueryStatus(instance.event); PTI_ASSERT(status == ZE_RESULT_SUCCESS); @@ -362,7 +366,11 @@ class ZeKernelCollector { cpu_end = cpu_start + time; } - callback_(callback_data_, instance.name, + PTI_ASSERT(instance.queue != nullptr); + PTI_ASSERT(!instance.name.empty()); + PTI_ASSERT(instance.append_time > 0); + PTI_ASSERT(instance.submit_time > 0); + callback_(callback_data_, instance.queue, instance.name, instance.append_time, instance.submit_time, cpu_start, cpu_end); } @@ -403,7 +411,7 @@ class ZeKernelCollector { kernel_info_map_[name] = { time, time, time, 1, simd_width, bytes_transferred}; } else { - KernelInfo& kernel = kernel_info_map_[name]; + ZeKernelInfo& kernel = kernel_info_map_[name]; kernel.total_time += time; if (time > kernel.max_time) { kernel.max_time = time; @@ -429,7 +437,7 @@ class ZeKernelCollector { PTI_ASSERT(context != nullptr); const std::lock_guard lock(lock_); PTI_ASSERT(command_list_map_.count(command_list) == 0); - command_list_map_[command_list] = {context, std::vector(), immediate}; + command_list_map_[command_list] = {context, std::vector(), immediate}; } void RemoveCommandList(ze_command_list_handle_t command_list) { @@ -446,16 +454,18 @@ class ZeKernelCollector { command_list_map_[command_list].kernel_list.clear(); } - void SetKernelInstanceSubmitTime(ze_command_list_handle_t command_list, - uint64_t submit_time) { + void UpdateKernelInstances( + ze_command_list_handle_t command_list, + ze_command_queue_handle_t queue, uint64_t submit_time) { PTI_ASSERT(command_list != nullptr); const std::lock_guard lock(lock_); PTI_ASSERT(command_list_map_.count(command_list) == 1); - 
CommandListInfo& command_list_info = command_list_map_[command_list]; + ZeCommandListInfo& command_list_info = command_list_map_[command_list]; if (!command_list_info.immediate) { - std::vector& kernel_list = + std::vector& kernel_list = command_list_info.kernel_list; for (size_t i = 0; i < kernel_list.size(); ++i) { + kernel_list[i]->queue = queue; kernel_list[i]->submit_time = submit_time; } } @@ -466,7 +476,7 @@ class ZeKernelCollector { PTI_ASSERT(command_list != nullptr); const std::lock_guard lock(lock_); PTI_ASSERT(command_list_map_.count(command_list) == 1); - CommandListInfo& command_list_info = command_list_map_[command_list]; + ZeCommandListInfo& command_list_info = command_list_map_[command_list]; return command_list_info.context; } @@ -592,7 +602,7 @@ class ZeKernelCollector { return; } - KernelInstance* instance = new KernelInstance; + ZeKernelInstance* instance = new ZeKernelInstance; PTI_ASSERT(instance != nullptr); instance->name = collector->GetKernelName(*(params->phKernel)); PTI_ASSERT(!instance->name.empty()); @@ -604,6 +614,7 @@ class ZeKernelCollector { instance->bytes_transferred = 0; instance->append_time = collector->GetTimestamp(); instance->submit_time = 0; + instance->queue = nullptr; if (*(params->phSignalEvent) == nullptr) { ze_context_handle_t context = @@ -629,7 +640,7 @@ class ZeKernelCollector { return; } - KernelInstance* instance = new KernelInstance; + ZeKernelInstance* instance = new ZeKernelInstance; instance->name = "zeCommandListAppendMemoryCopy"; instance->bytes_transferred = *(params->psize); instance->simd_width = 0; @@ -654,7 +665,7 @@ class ZeKernelCollector { ze_result_t result) { PTI_ASSERT(command_list != nullptr); - KernelInstance* instance = static_cast(*instance_data); + ZeKernelInstance* instance = static_cast(*instance_data); if (instance == nullptr) { return; } @@ -706,7 +717,7 @@ class ZeKernelCollector { } } - static void OnExitCommandListImmediateCreate( + static void OnExitCommandListCreateImmediate( 
ze_command_list_create_immediate_params_t* params, ze_result_t result, void* global_data, void** instance_data) { if (result == ZE_RESULT_SUCCESS) { @@ -756,7 +767,8 @@ class ZeKernelCollector { uint32_t command_list_count = *params->pnumCommandLists; ze_command_list_handle_t* command_lists = *params->pphCommandLists; for (uint32_t i = 0; i < command_list_count; ++i) { - collector->SetKernelInstanceSubmitTime(command_lists[i], submit_time); + collector->UpdateKernelInstances( + command_lists[i], *(params->phCommandQueue), submit_time); } } @@ -786,20 +798,20 @@ class ZeKernelCollector { zel_tracer_handle_t tracer_ = nullptr; uint64_t timer_frequency_ = 0; - KernelTimePoint base_time_; + ZeKernelTimePoint base_time_; - OnKernelFinishCallback callback_ = nullptr; + OnZeKernelFinishCallback callback_ = nullptr; void* callback_data_ = nullptr; - KernelTimePoint cpu_timestamp_; + ZeKernelTimePoint cpu_timestamp_; uint64_t gpu_timestamp_ = 0; std::mutex lock_; - KernelInfoMap kernel_info_map_; - KernelIntervalList kernel_interval_list_; - KernelNameMap kernel_name_map_; - std::list kernel_instance_list_; - CommandListMap command_list_map_; + ZeKernelInfoMap kernel_info_map_; + ZeKernelIntervalList kernel_interval_list_; + ZeKernelNameMap kernel_name_map_; + std::list kernel_instance_list_; + ZeCommandListMap command_list_map_; static const uint32_t kKernelLength = 10; static const uint32_t kCallsLength = 12; diff --git a/samples/ze_metric_info/main.cc b/samples/ze_metric_info/main.cc index 4910c1a..66cc083 100644 --- a/samples/ze_metric_info/main.cc +++ b/samples/ze_metric_info/main.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_metric_query/tool.cc b/samples/ze_metric_query/tool.cc index e781820..a2998fe 100644 --- 
a/samples/ze_metric_query/tool.cc +++ b/samples/ze_metric_query/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_metric_query/ze_metric_collector.h b/samples/ze_metric_query/ze_metric_collector.h index 1a64a71..a3ae100 100644 --- a/samples/ze_metric_query/ze_metric_collector.h +++ b/samples/ze_metric_query/ze_metric_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_metric_streamer/tool.cc b/samples/ze_metric_streamer/tool.cc index b44f068..76936fe 100644 --- a/samples/ze_metric_streamer/tool.cc +++ b/samples/ze_metric_streamer/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -86,7 +86,7 @@ static KernelMap GetKernelMap() { return KernelMap(); } - const KernelIntervalList& kernel_interval_list = + const ZeKernelIntervalList& kernel_interval_list = kernel_collector->GetKernelIntervalList(); if (kernel_interval_list.size() == 0) { return KernelMap(); diff --git a/samples/ze_metric_streamer/ze_metric_collector.h b/samples/ze_metric_streamer/ze_metric_collector.h index 097399d..4ac7c51 100644 --- a/samples/ze_metric_streamer/ze_metric_collector.h +++ b/samples/ze_metric_streamer/ze_metric_collector.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: 
MIT // ============================================================= diff --git a/samples/ze_tracer/README.md b/samples/ze_tracer/README.md index 93697d6..4090abe 100644 --- a/samples/ze_tracer/README.md +++ b/samples/ze_tracer/README.md @@ -2,7 +2,7 @@ ## Overview This tool is an analogue of [Intercept Layer for OpenCL(TM) Applications](https://github.com/intel/opencl-intercept-layer) designed to support Level Zero. -Currently it has limited capabilities but expected to be fully functional eventually: +The following capabilities are available currently: ``` Usage: ./ze_tracer[.exe] [options] Options: @@ -31,8 +31,8 @@ Options: ``` === API Timing Results: === -Total Execution Time (ns): 418056422 -Total API Time (ns): 407283268 +Total Execution Time (ns): 418056422 + Total API Time (ns): 407283268 Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) zeCommandQueueSynchronize, 4, 182529847, 44.82, 45632461, 45271728, 46364532 @@ -45,8 +45,8 @@ zeCommandQueueExecuteCommandLists, 4, 108593458, 26.66, ``` === Device Timing Results: === -Total Execution Time (ns): 376807360 -Total Device Time (ns): 178294707 +Total Execution Time (ns): 376807360 + Total Device Time (ns): 178294707 Kernel, Calls, SIMD, Transferred (bytes), Time (ns), Time (%), Average (ns), Min (ns), Max (ns) GEMM, 4, 32, 0, 173655671, 97.40, 43413917, 43343928, 43517564 @@ -55,10 +55,10 @@ zeCommandListAppendMemoryCopy, 12, 0, 50331648, ``` **Device Timeline** mode (***Linux kernel 5.0+ is required for accurate measurements***) dumps four timestamps for each device activity - *append* to the command list, *submit* to device queue, *start* and *end* on the device (all the timestamps are in CPU nanoseconds): ``` -Device Timeline for zeCommandListAppendMemoryCopy [ns] = 319154868 (append) 320972649 (submit) 320021623 (start) 320440290 (end) -Device Timeline for zeCommandListAppendMemoryCopy [ns] = 319281072 (append) 320972649 (submit) 320441707 (start) 320738290 (end) -Device Timeline for 
GEMM [ns] = 319344934 (append) 320972649 (submit) 320740123 (start) 364337290 (end) -Device Timeline for zeCommandListAppendMemoryCopy [ns] = 319348093 (append) 320972649 (submit) 364338873 (start) 364765123 (end) +Device Timeline (queue: 0x556fa2318fc0): zeCommandListAppendMemoryCopy [ns] = 396835703 (append) 398002195 (submit) 399757026 (start) 400230526 (end) +Device Timeline (queue: 0x556fa2318fc0): zeCommandListAppendMemoryCopy [ns] = 397039340 (append) 398002195 (submit) 400231776 (start) 400547193 (end) +Device Timeline (queue: 0x556fa2318fc0): GEMM [ns] = 397513563 (append) 398002195 (submit) 400548943 (start) 443632026 (end) +Device Timeline (queue: 0x556fa2318fc0): zeCommandListAppendMemoryCopy [ns] = 397632053 (append) 398002195 (submit) 443633526 (start) 444084943 (end) ... ``` **Chrome Device Timeline** mode dumps timestamps for device activities to JSON format that can be opened in [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) browser tool. 
diff --git a/samples/ze_tracer/tool.cc b/samples/ze_tracer/tool.cc index 81ec57a..9c3482c 100644 --- a/samples/ze_tracer/tool.cc +++ b/samples/ze_tracer/tool.cc @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= diff --git a/samples/ze_tracer/ze_tracer.h b/samples/ze_tracer/ze_tracer.h index 7b4e929..610eeff 100644 --- a/samples/ze_tracer/ze_tracer.h +++ b/samples/ze_tracer/ze_tracer.h @@ -1,5 +1,5 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright (C) Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= @@ -12,10 +12,7 @@ #include #include #include -#include -#include -#include -#include +#include #include #include "ze_api_collector.h" @@ -47,15 +44,14 @@ class ZeTracer { tracer->CheckOption(ZET_CHROME_CALL_LOGGING) || tracer->CheckOption(ZET_HOST_TIMING)) { - OnFunctionFinishCallback callback = nullptr; + OnZeFunctionFinishCallback callback = nullptr; if (tracer->CheckOption(ZET_CHROME_CALL_LOGGING)) { callback = ChromeLoggingCallback; } bool call_tracing = tracer->CheckOption(ZET_CALL_LOGGING); api_collector = ZeApiCollector::Create( - tracer->start_time_, call_tracing, - callback, tracer); + tracer->start_time_, call_tracing, callback, tracer); if (api_collector == nullptr) { std::cerr << "[WARNING] Unable to create API collector" << std::endl; delete tracer; @@ -69,7 +65,7 @@ class ZeTracer { tracer->CheckOption(ZET_CHROME_DEVICE_TIMELINE) || tracer->CheckOption(ZET_DEVICE_TIMING)) { - OnKernelFinishCallback callback = nullptr; + OnZeKernelFinishCallback callback = nullptr; if (tracer->CheckOption(ZET_DEVICE_TIMELINE) && tracer->CheckOption(ZET_CHROME_DEVICE_TIMELINE)) { callback = DeviceAndChromeTimelineCallback; @@ -141,7 
+137,7 @@ class ZeTracer { void ReportHostTiming() { PTI_ASSERT(api_collector_ != nullptr); - const FunctionInfoMap& function_info_map = + const ZeFunctionInfoMap& function_info_map = api_collector_->GetFunctionInfoMap(); if (function_info_map.size() == 0) { return; @@ -152,12 +148,17 @@ class ZeTracer { total_duration += value.second.total_time; } + std::string title = "Total Execution Time (ns): "; + const size_t title_width = title.size(); + const size_t time_width = 20; + std::cerr << std::endl; std::cerr << "=== API Timing Results: ===" << std::endl; std::cerr << std::endl; - std::cerr << "Total Execution Time (ns): " << - total_execution_time_ << std::endl; - std::cerr << "Total API Time (ns): " << total_duration << std::endl; + std::cerr << std::setw(title_width) << title << + std::setw(time_width) << total_execution_time_ << std::endl; + std::cerr << std::setw(title_width) << "Total API Time (ns): " << + std::setw(time_width) << total_duration << std::endl; std::cerr << std::endl; if (total_duration > 0) { @@ -168,7 +169,7 @@ class ZeTracer { void ReportDeviceTiming() { PTI_ASSERT(kernel_collector_ != nullptr); - const KernelInfoMap& kernel_info_map = + const ZeKernelInfoMap& kernel_info_map = kernel_collector_->GetKernelInfoMap(); if (kernel_info_map.size() == 0) { return; @@ -179,12 +180,17 @@ class ZeTracer { total_duration += value.second.total_time; } + std::string title = "Total Execution Time (ns): "; + const size_t title_width = title.size(); + const size_t time_width = 20; + std::cerr << std::endl; std::cerr << "=== Device Timing Results: ===" << std::endl; std::cerr << std::endl; - std::cerr << "Total Execution Time (ns): " << - total_execution_time_ << std::endl; - std::cerr << "Total Device Time (ns): " << total_duration << std::endl; + std::cerr << std::setw(title_width) << title << + std::setw(time_width) << total_execution_time_ << std::endl; + std::cerr << std::setw(title_width) << "Total Device Time (ns): " << + std::setw(time_width) << 
total_duration << std::endl; std::cerr << std::endl; if (total_duration > 0) { @@ -203,14 +209,17 @@ class ZeTracer { } static void DeviceTimelineCallback( - void* data, const std::string& name, + void* data, void* queue, const std::string& name, uint64_t appended, uint64_t submitted, uint64_t started, uint64_t ended) { - std::cerr << "Device Timeline for " << name << " [ns] = " << + std::stringstream stream; + stream << "Device Timeline (queue: " << queue << + "): " << name << " [ns] = " << appended << " (append) " << submitted << " (submit) " << started << " (start) " << ended << " (end)" << std::endl; + std::cerr << stream.str(); } void OpenTraceFile() { @@ -231,24 +240,27 @@ class ZeTracer { } static void ChromeTimelineCallback( - void* data, const std::string& name, + void* data, void* queue, const std::string& name, uint64_t appended, uint64_t submitted, uint64_t started, uint64_t ended) { ZeTracer* tracer = reinterpret_cast(data); PTI_ASSERT(tracer != nullptr); - tracer->chrome_trace_ << "{\"ph\":\"X\", \"pid\":" << - utils::GetPid() << ", \"tid\":0, \"name\":\"" << name << + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << utils::GetPid() << + ", \"tid\":" << reinterpret_cast(queue) << + ", \"name\":\"" << name << "\", \"ts\": " << started / NSEC_IN_USEC << ", \"dur\":" << (ended - started) / NSEC_IN_USEC << "}," << std::endl; + tracer->chrome_trace_ << stream.str(); } static void DeviceAndChromeTimelineCallback( - void* data, const std::string& name, + void* data, void* queue, const std::string& name, uint64_t appended, uint64_t submitted, uint64_t started, uint64_t ended) { - DeviceTimelineCallback(data, name, appended, submitted, started, ended); - ChromeTimelineCallback(data, name, appended, submitted, started, ended); + DeviceTimelineCallback(data, queue, name, appended, submitted, started, ended); + ChromeTimelineCallback(data, queue, name, appended, submitted, started, ended); } static void ChromeLoggingCallback( @@ -256,12 +268,14 
@@ class ZeTracer { uint64_t started, uint64_t ended) { ZeTracer* tracer = reinterpret_cast(data); PTI_ASSERT(tracer != nullptr); - tracer->chrome_trace_ << "{\"ph\":\"X\", \"pid\":" << + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << utils::GetPid() << ", \"tid\":" << utils::GetTid() << ", \"name\":\"" << name << "\", \"ts\": " << started / NSEC_IN_USEC << ", \"dur\":" << (ended - started) / NSEC_IN_USEC << "}," << std::endl; + tracer->chrome_trace_ << stream.str(); } private: diff --git a/tests/cl_tracer.py b/tests/cl_tracer.py new file mode 100644 index 0000000..1680013 --- /dev/null +++ b/tests/cl_tracer.py @@ -0,0 +1,68 @@ +import os +import subprocess +import sys + +import cl_gemm +import utils + +def config(path): + p = subprocess.Popen(["cmake",\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ + cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + p.wait() + stdout, stderr = utils.run_process(p) + if stderr and stderr.find("CMake Error") != -1: + return stderr + return None + +def build(path): + p = subprocess.Popen(["make"], cwd = path,\ + stdout = subprocess.PIPE, stderr = subprocess.PIPE) + p.wait() + stdout, stderr = utils.run_process(p) + if stderr and stderr.lower().find("error") != -1: + return stderr + return None + +def run(path, option): + app_folder = utils.get_sample_build_path("cl_gemm") + app_file = os.path.join(app_folder, "cl_gemm") + p = subprocess.Popen(["./cl_tracer", option, app_file, "cpu", "1024", "1"],\ + cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + stdout, stderr = utils.run_process(p) + if not stderr: + return stdout + if stdout.find(" CORRECT") == -1: + return stdout + return None + +def main(option): + path = utils.get_sample_build_path("cl_tracer") + log = cl_gemm.main("cpu") + if log: + return log + log = config(path) + if log: + return log + log = build(path) + if log: + return log + log = run(path, option) + if log: + return log + +if __name__ == "__main__": + 
option = "-c" + if len(sys.argv) > 1 and sys.argv[1] == "-h": + option = "-h" + if len(sys.argv) > 1 and sys.argv[1] == "-d": + option = "-d" + if len(sys.argv) > 1 and sys.argv[1] == "-t": + option = "-t" + if len(sys.argv) > 1 and sys.argv[1] == "--chrome-device-timeline": + option = "--chrome-device-timeline" + if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging": + option = "--chrome-call-logging" + log = main(option) + if log: + print(log) \ No newline at end of file diff --git a/tests/dpc_gemm.py b/tests/dpc_gemm.py index 2fd0a6f..d0683e4 100644 --- a/tests/dpc_gemm.py +++ b/tests/dpc_gemm.py @@ -5,9 +5,8 @@ import utils def config(path): - e = utils.add_env(None, "CXX", "dpcpp") p = subprocess.Popen(["cmake",\ - "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."], env = e,\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) p.wait() stdout, stderr = utils.run_process(p) diff --git a/tests/dpc_info.py b/tests/dpc_info.py index a92d76a..e608185 100644 --- a/tests/dpc_info.py +++ b/tests/dpc_info.py @@ -5,9 +5,8 @@ import utils def config(path): - e = utils.add_env(None, "CXX", "dpcpp") p = subprocess.Popen(["cmake",\ - "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."], env = e,\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) p.wait() stdout, stderr = utils.run_process(p) diff --git a/tests/omp_gemm.py b/tests/omp_gemm.py index 69f67bb..a7e6c43 100644 --- a/tests/omp_gemm.py +++ b/tests/omp_gemm.py @@ -5,9 +5,8 @@ import utils def config(path): - e = utils.add_env(None, "CXX", "icpx") p = subprocess.Popen(["cmake",\ - "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."], env = e,\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) p.wait() stdout, stderr = utils.run_process(p) diff --git a/tests/omp_hot_regions.py 
b/tests/omp_hot_regions.py index 998f434..c74549a 100644 --- a/tests/omp_hot_regions.py +++ b/tests/omp_hot_regions.py @@ -6,9 +6,8 @@ import utils def config(path): - e = utils.add_env(None, "CXX", "icpx") p = subprocess.Popen(["cmake",\ - "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."], env = e,\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) p.wait() stdout, stderr = utils.run_process(p) diff --git a/tests/onetrace.py b/tests/onetrace.py new file mode 100644 index 0000000..7ff1fa2 --- /dev/null +++ b/tests/onetrace.py @@ -0,0 +1,68 @@ +import os +import subprocess +import sys + +import dpc_gemm +import utils + +def config(path): + p = subprocess.Popen(["cmake",\ + "-DCMAKE_BUILD_TYPE=" + utils.get_build_flag(), ".."],\ + cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + p.wait() + stdout, stderr = utils.run_process(p) + if stderr and stderr.find("CMake Error") != -1: + return stderr + return None + +def build(path): + p = subprocess.Popen(["make"], cwd = path,\ + stdout = subprocess.PIPE, stderr = subprocess.PIPE) + p.wait() + stdout, stderr = utils.run_process(p) + if stderr and stderr.lower().find("error") != -1: + return stderr + return None + +def run(path, option): + app_folder = utils.get_sample_build_path("dpc_gemm") + app_file = os.path.join(app_folder, "dpc_gemm") + p = subprocess.Popen(["./onetrace", option, app_file, "gpu", "1024", "1"],\ + cwd = path, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + stdout, stderr = utils.run_process(p) + if not stderr: + return stdout + if stdout.find(" CORRECT") == -1: + return stdout + return None + +def main(option): + path = utils.get_tool_build_path("onetrace") + log = dpc_gemm.main("gpu") + if log: + return log + log = config(path) + if log: + return log + log = build(path) + if log: + return log + log = run(path, option) + if log: + return log + +if __name__ == "__main__": + option = "-c" + if len(sys.argv) > 
1 and sys.argv[1] == "-h": + option = "-h" + if len(sys.argv) > 1 and sys.argv[1] == "-d": + option = "-d" + if len(sys.argv) > 1 and sys.argv[1] == "-t": + option = "-t" + if len(sys.argv) > 1 and sys.argv[1] == "--chrome-device-timeline": + option = "--chrome-device-timeline" + if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging": + option = "--chrome-call-logging" + log = main(option) + if log: + print(log) \ No newline at end of file diff --git a/tests/run.py b/tests/run.py index b48fc54..e26c980 100644 --- a/tests/run.py +++ b/tests/run.py @@ -13,6 +13,7 @@ ["cl_gpu_metrics", None], ["cl_hot_functions", "gpu", "cpu", "dpc", "omp"], ["cl_hot_kernels", "gpu", "cpu", "dpc", "omp"], + ["cl_tracer", "-c", "-h", "-d", "-t", "--chrome-device-timeline", "--chrome-call-logging"], ["gpu_info", "-d", "-m"], ["gpu_inst_count", "cl", "ze", "dpc"], ["gpu_perfmon_read", "cl", "ze", "dpc"], @@ -31,6 +32,8 @@ ["dpc_gemm", "gpu", "cpu", "host"], ["dpc_info", "-a", "-l"]] +tools = [["onetrace", "-c", "-h", "-d", "-t", "--chrome-device-timeline", "--chrome-call-logging"]] + def remove_python_cache(path): files = os.listdir(path) for file in files: @@ -46,7 +49,12 @@ def clean(): path = utils.get_sample_build_path(sample[0]) if os.path.exists(path): shutil.rmtree(path) - + + for tool in tools: + path = utils.get_tool_build_path(tool[0]) + if os.path.exists(path): + shutil.rmtree(path) + remove_python_cache(utils.get_build_utils_path()) remove_python_cache(utils.get_script_path()) @@ -55,20 +63,26 @@ def clean(): if file.endswith(".log"): os.remove(os.path.join(root, file)) -def test(f, sample, option): - if option: - sys.stdout.write("Running sample test for " + sample + " (" + option + ")...") +def test(f, name, option, istool = False): + if istool: + if option: + sys.stdout.write("Running tool test for " + name + " (" + option + ")...") + else: + sys.stdout.write("Running tool test for " + name + "...") else: - sys.stdout.write("Running sample test for " + sample + 
"...") + if option: + sys.stdout.write("Running sample test for " + name + " (" + option + ")...") + else: + sys.stdout.write("Running sample test for " + name + "...") sys.stdout.flush() - module = importlib.import_module(sample) + module = importlib.import_module(name) log = module.main(option) if log: sys.stdout.write("FAILED\n") if option: - f.write("======= " + sample + " (" + option + ") =======\n") + f.write("======= " + name + " (" + option + ") =======\n") else: - f.write("======= " + sample + " =======\n") + f.write("======= " + name + " =======\n") f.write(log) return False else: @@ -100,6 +114,16 @@ def main(): else: tests_failed += 1 + for tool in tools: + name = tool[0] + if re.search(tmpl, name) == None: + continue + for i in range(1, len(tool)): + if test(f, name, tool[i], True): + tests_passed += 1 + else: + tests_failed += 1 + f.close() print("PASSED: " + str(tests_passed) + " / FAILED: " + str(tests_failed)) diff --git a/tests/utils.py b/tests/utils.py index 4fb1655..f3178e8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,6 +19,16 @@ def get_sample_build_path(name): os.mkdir(path) return path +def get_tool_build_path(name): + head, tail = os.path.split(get_script_path()) + path = os.path.join(head, "tools") + path = os.path.join(path, name) + assert os.path.exists(path) + path = os.path.join(path, "build") + if not os.path.exists(path): + os.mkdir(path) + return path + def get_build_utils_path(): head, tail = os.path.split(get_script_path()) path = os.path.join(head, "samples") diff --git a/tools/onetrace/CMakeLists.txt b/tools/onetrace/CMakeLists.txt new file mode 100644 index 0000000..9973042 --- /dev/null +++ b/tools/onetrace/CMakeLists.txt @@ -0,0 +1,51 @@ +include("../../samples/build_utils/CMakeLists.txt") +SetRequiredCMakeVersion() +cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION}) + +project(PTI_Tools_OneTrace CXX) +SetCompilerFlags() +SetBuildType() + +# Tool Library + +add_library(onetrace_tool SHARED + 
"${PROJECT_SOURCE_DIR}/../../samples/utils/trace_guard.cc" + "${PROJECT_SOURCE_DIR}/../../samples/loader/init.cc" + tool.cc) +target_include_directories(onetrace_tool + PRIVATE "${PROJECT_SOURCE_DIR}" + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/utils" + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/cl_hot_functions" + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/cl_hot_kernels" + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/ze_hot_functions" + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/ze_hot_kernels") +if(CMAKE_INCLUDE_PATH) + target_include_directories(onetrace_tool + PUBLIC "${CMAKE_INCLUDE_PATH}") +endif() + +FindOpenCLLibrary(onetrace_tool) +FindOpenCLHeaders(onetrace_tool) + +GetOpenCLTracingHeaders(onetrace_tool) + +FindL0Library(onetrace_tool) +FindL0Headers(onetrace_tool) + +FindL0HeadersPath(onetrace_tool "${PROJECT_SOURCE_DIR}/../../samples/ze_hot_functions/gen_tracing_callbacks.py") + +if(UNIX) + FindDRMLibrary(onetrace_tool) + CheckDRMHeaders(onetrace_tool) +endif() + +# Loader + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOOL_NAME=onetrace_tool") +add_executable(onetrace "${PROJECT_SOURCE_DIR}/../../samples/loader/loader.cc") +target_include_directories(onetrace + PRIVATE "${PROJECT_SOURCE_DIR}/../../samples/utils") +if(UNIX) + target_link_libraries(onetrace + dl) +endif() \ No newline at end of file diff --git a/tools/onetrace/README.md b/tools/onetrace/README.md new file mode 100644 index 0000000..c2326e0 --- /dev/null +++ b/tools/onetrace/README.md @@ -0,0 +1,141 @@ +# Tracing and Profiling Tool for Data Parallel C++ (DPC++) +## Overview +This tool provides basic tracing and profiling capabilities for the compute applications based on Intel runtimes for OpenCL(TM) and Level Zero, like DPC++ and OpenMP* GPU offload programs. 
+ +The following capabilities are available: +``` +Usage: ./onetrace[.exe] [options] +Options: +--call-logging [-c] Trace host API calls +--host-timing [-h] Report host API execution time +--device-timing [-d] Report kernels execution time +--device-timeline [-t] Trace device activities +--chrome-device-timeline Dump device activities to JSON file +--chrome-call-logging Dump host API calls to JSON file +``` + +**Call Logging** mode allows grabbing the full host API trace, e.g.: +``` +... +>>>> [271632470] clCreateBuffer: context = 0x5591dba3f860 flags = 4 size = 4194304 hostPtr = 0 errcodeRet = 0x7ffd334b2f04 +<<<< [271640078] clCreateBuffer [7608 ns] result = 0x5591dbaa5760 -> CL_SUCCESS (0) +>>>> [272171119] clEnqueueWriteBuffer: commandQueue = 0x5591dbf4be70 buffer = 0x5591dbaa5760 blockingWrite = 1 offset = 0 cb = 4194304 ptr = 0x5591dc92af90 numEventsInWaitList = 0 eventWaitList = 0 event = 0 +<<<< [272698660] clEnqueueWriteBuffer [527541 ns] -> CL_SUCCESS (0) +>>>> [272716922] clSetKernelArg: kernel = 0x5591dc500c60 argIndex = 0 argSize = 8 argValue = 0x7ffd334b2f10 +<<<< [272724034] clSetKernelArg [7112 ns] -> CL_SUCCESS (0) +>>>> [272729938] clSetKernelArg: kernel = 0x5591dc500c60 argIndex = 1 argSize = 8 argValue = 0x7ffd334b2f18 +<<<< [272733712] clSetKernelArg [3774 ns] -> CL_SUCCESS (0) +... +``` +**Chrome Call Logging** mode dumps API calls to JSON format that can be opened in [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) browser tool. 
+ +**Host Timing** mode collects duration for each API call and provides the summary for the whole application: +``` +=== API Timing Results: === + + Total Execution Time (ns): 372547856 + Total API Time for L0 backend (ns): 355680113 +Total API Time for CL CPU backend (ns): 7119 +Total API Time for CL GPU backend (ns): 2550 + +== L0 Backend: == + + Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + zeEventHostSynchronize, 32, 181510841, 51.03, 5672213, 72, 45327080 + zeModuleCreate, 1, 96564991, 27.15, 96564991, 96564991, 96564991 + zeCommandQueueExecuteCommandLists, 8, 76576727, 21.53, 9572090, 20752, 76024831 +... + +== CL CPU Backend: == + + Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + clGetDeviceInfo, 6, 3094, 43.46, 515, 216, 1295 +clGetPlatformInfo, 2, 1452, 20.40, 726, 487, 965 + clGetDeviceIDs, 4, 987, 13.86, 246, 93, 513 +... + +== CL GPU Backend: == + + Function, Calls, Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + clGetDeviceIDs, 4, 955, 37.45, 238, 153, 352 + clGetDeviceInfo, 6, 743, 29.14, 123, 65, 244 + clReleaseDevice, 2, 331, 12.98, 165, 134, 197 +... +``` +**Device Timing** mode collects duration for each kernel on the device and provides the summary for the whole application: +``` +=== Device Timing Results: === + + Total Execution Time (ns): 362771691 +Total Device Time for CL GPU backend (ns): 176849013 + +== CL GPU Backend: == + + Kernel, Calls, SIMD, Transferred (bytes), Time (ns), Time (%), Average (ns), Min (ns), Max (ns) + GEMM, 4, 32, 0, 171239582, 96.83, 42809895, 42567333, 43121083 +clEnqueueWriteBuffer, 8, 0, 33554432, 3362082, 1.90, 420260, 298500, 532500 + clEnqueueReadBuffer, 4, 0, 16777216, 2247349, 1.27, 561837, 556520, 565134 +... 
+``` +**Device Timeline** mode dumps four timestamps for each device activity - *queued* to the host command queue for OpenCL(TM) or *append* to the command list for Level Zero, *submit* to device queue, *start* and *end* on the device (all the timestamps are in CPU nanoseconds): +``` +... +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueWriteBuffer [ns] = 317341082 (queued) 317355010 (submit) 317452332 (start) 317980165 (end) +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueWriteBuffer [ns] = 317789774 (queued) 317814558 (submit) 318160607 (start) 318492690 (end) +Device Timeline (queue: 0x55a9c7e51e70): GEMM [ns] = 318185764 (queued) 318200629 (submit) 318550014 (start) 361260930 (end) +Device Timeline (queue: 0x55a9c7e51e70): clEnqueueReadBuffer [ns] = 361479600 (queued) 361481387 (submit) 361482574 (start) 362155593 (end) +... +``` +**Chrome Device Timeline** mode dumps timestamps for device activities to JSON format that can be opened in [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) browser tool. + +## Supported OS +- Linux +- Windows (*under development*) + +## Prerequisites +- [CMake](https://cmake.org/) (version 2.8 and above) +- [Git](https://git-scm.com/) (version 1.8 and above) +- [Python](https://www.python.org/) (version 2.7 and above) +- [OpenCL(TM) ICD Loader](https://github.com/KhronosGroup/OpenCL-ICD-Loader) +- [oneAPI Level Zero loader](https://github.com/oneapi-src/level-zero) +- [Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver](https://github.com/intel/compute-runtime) to run on GPU +- [Intel(R) Xeon(R) Processor / Intel(R) Core(TM) Processor (CPU) Runtimes](https://software.intel.com/en-us/articles/opencl-drivers#cpu-section) to run on CPU +- [libdrm](https://gitlab.freedesktop.org/mesa/drm) + +## Build and Run +### Linux +Run the following commands to build the tool: +```sh +cd /tools/onetrace +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=Release .. 
+make +``` +Use this command line to run the tool: +```sh +./onetrace [options] +``` +One may use, for example, [dpc_gemm](../../samples/dpc_gemm) as the target application: +```sh +./onetrace -c -h ../../../samples/dpc_gemm/build/dpc_gemm cpu +./onetrace -c -h ../../../samples/dpc_gemm/build/dpc_gemm gpu +``` +### Windows +Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the tool: +```sh +cd \tools\onetrace +mkdir build +cd build +cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_LIBRARY_PATH= .. +nmake +``` +Use this command line to run the tool: +```sh +onetrace.exe [options] +``` +One may use, for example, [dpc_gemm](../../samples/dpc_gemm) as the target application: +```sh +onetrace.exe -c -h ..\..\..\samples\dpc_gemm\build\dpc_gemm.exe cpu +onetrace.exe -c -h ..\..\..\samples\dpc_gemm\build\dpc_gemm.exe gpu +``` \ No newline at end of file diff --git a/tools/onetrace/tool.cc b/tools/onetrace/tool.cc new file mode 100644 index 0000000..fcd9cd8 --- /dev/null +++ b/tools/onetrace/tool.cc @@ -0,0 +1,141 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include + +#include "unified_tracer.h" + +static UnifiedTracer* tracer = nullptr; + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void Usage() { + std::cout << + "Usage: ./onetrace[.exe] [options] " << + std::endl; + std::cout << "Options:" << std::endl; + std::cout << + "--call-logging [-c] Trace host API calls" << + std::endl; + std::cout << + "--host-timing [-h] Report host API execution time" << + std::endl; + std::cout << + "--device-timing [-d] Report kernels execution time" << + std::endl; + std::cout << + "--device-timeline [-t] Trace device activities" << + std::endl; + std::cout << + "--chrome-device-timeline Dump device activities to JSON file" << + std::endl; + std::cout << + "--chrome-call-logging Dump host API calls to JSON 
file" << + std::endl; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +int ParseArgs(int argc, char* argv[]) { + int app_index = 1; + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--call-logging") == 0 || + strcmp(argv[i], "-c") == 0) { + utils::SetEnv("ONETRACE_CallLogging=1"); + ++app_index; + } else if (strcmp(argv[i], "--host-timing") == 0 || + strcmp(argv[i], "-h") == 0) { + utils::SetEnv("ONETRACE_HostTiming=1"); + ++app_index; + } else if (strcmp(argv[i], "--device-timing") == 0 || + strcmp(argv[i], "-d") == 0) { + utils::SetEnv("ONETRACE_DeviceTiming=1"); + ++app_index; + } else if (strcmp(argv[i], "--device-timeline") == 0 || + strcmp(argv[i], "-t") == 0) { + utils::SetEnv("ONETRACE_DeviceTimeline=1"); + ++app_index; + } else if (strcmp(argv[i], "--chrome-device-timeline") == 0) { + utils::SetEnv("ONETRACE_ChromeDeviceTimeline=1"); + ++app_index; + } else if (strcmp(argv[i], "--chrome-call-logging") == 0) { + utils::SetEnv("ONETRACE_ChromeCallLogging=1"); + ++app_index; + } else { + break; + } + } + return app_index; +} + +extern "C" +#if defined(_WIN32) +__declspec(dllexport) +#endif +void SetToolEnv() { + utils::SetEnv("ZE_ENABLE_TRACING_LAYER=1"); +} + +static unsigned ReadArgs() { + std::string value; + unsigned options = 0; + + value = utils::GetEnv("ONETRACE_CallLogging"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_CALL_LOGGING); + } + + value = utils::GetEnv("ONETRACE_HostTiming"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_HOST_TIMING); + } + + value = utils::GetEnv("ONETRACE_DeviceTiming"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_DEVICE_TIMING); + } + + value = utils::GetEnv("ONETRACE_DeviceTimeline"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_DEVICE_TIMELINE); + } + + value = utils::GetEnv("ONETRACE_ChromeDeviceTimeline"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_CHROME_DEVICE_TIMELINE); 
+ } + + value = utils::GetEnv("ONETRACE_ChromeCallLogging"); + if (!value.empty() && value == "1") { + options |= (1 << ONETRACE_CHROME_CALL_LOGGING); + } + + return options; +} + +void EnableProfiling() { + ze_result_t status = ZE_RESULT_SUCCESS; + status = zeInit(ZE_INIT_FLAG_GPU_ONLY); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + + unsigned options = ReadArgs(); + if (options == 0) { + options |= (1 << ONETRACE_HOST_TIMING); + options |= (1 << ONETRACE_DEVICE_TIMING); + } + + tracer = UnifiedTracer::Create(options); +} + +void DisableProfiling() { + if (tracer != nullptr) { + delete tracer; + } +} \ No newline at end of file diff --git a/tools/onetrace/unified_tracer.h b/tools/onetrace/unified_tracer.h new file mode 100644 index 0000000..b7dd35f --- /dev/null +++ b/tools/onetrace/unified_tracer.h @@ -0,0 +1,528 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef PTI_SAMPLES_ONETRACE_UNIFIED_TRACER_H_ +#define PTI_SAMPLES_ONETRACE_UNIFIED_TRACER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "cl_api_collector.h" +#include "cl_kernel_collector.h" +#include "utils.h" +#include "ze_api_collector.h" +#include "ze_kernel_collector.h" + +#define ONETRACE_CALL_LOGGING 0 +#define ONETRACE_HOST_TIMING 1 +#define ONETRACE_DEVICE_TIMING 2 +#define ONETRACE_DEVICE_TIMELINE 3 +#define ONETRACE_CHROME_DEVICE_TIMELINE 4 +#define ONETRACE_CHROME_CALL_LOGGING 5 + +const char* kChromeTraceFileName = "onetrace.json"; + +class UnifiedTracer { + public: + static UnifiedTracer* Create(unsigned options) { + cl_device_id cl_cpu_device = utils::cl::GetIntelDevice(CL_DEVICE_TYPE_CPU); + cl_device_id cl_gpu_device = utils::cl::GetIntelDevice(CL_DEVICE_TYPE_GPU); + if (cl_cpu_device == nullptr && cl_gpu_device == nullptr) { + std::cerr << "[WARNING] Intel OpenCL devices are not found" << 
std::endl; + return nullptr; + } + + UnifiedTracer* tracer = new UnifiedTracer(options); + + if (tracer->CheckOption(ONETRACE_CALL_LOGGING) || + tracer->CheckOption(ONETRACE_CHROME_CALL_LOGGING) || + tracer->CheckOption(ONETRACE_HOST_TIMING)) { + + ZeApiCollector* ze_api_collector = nullptr; + ClApiCollector* cl_cpu_api_collector = nullptr; + ClApiCollector* cl_gpu_api_collector = nullptr; + + OnZeFunctionFinishCallback ze_callback = nullptr; + OnClFunctionFinishCallback cl_callback = nullptr; + if (tracer->CheckOption(ONETRACE_CHROME_CALL_LOGGING)) { + ze_callback = ChromeLoggingCallback; + cl_callback = ChromeLoggingCallback; + } + + bool call_tracing = tracer->CheckOption(ONETRACE_CALL_LOGGING); + + ze_api_collector = ZeApiCollector::Create( + tracer->start_time_, call_tracing, ze_callback, tracer); + if (ze_api_collector == nullptr) { + std::cerr << "[WARNING] Unable to create L0 API collector" << + std::endl; + } + tracer->ze_api_collector_ = ze_api_collector; + + if (cl_cpu_device != nullptr) { + cl_cpu_api_collector = ClApiCollector::Create( + cl_cpu_device, tracer->start_time_, + call_tracing, cl_callback, tracer); + if (cl_cpu_api_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create CL API collector for CPU backend" << + std::endl; + } + tracer->cl_cpu_api_collector_ = cl_cpu_api_collector; + } + + if (cl_gpu_device != nullptr) { + cl_gpu_api_collector = ClApiCollector::Create( + cl_gpu_device, tracer->start_time_, + call_tracing, cl_callback, tracer); + if (cl_gpu_api_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create CL API collector for GPU backend" << + std::endl; + } + tracer->cl_gpu_api_collector_ = cl_gpu_api_collector; + } + + if (ze_api_collector == nullptr && + cl_gpu_api_collector == nullptr && + cl_cpu_api_collector == nullptr) { + delete tracer; + return nullptr; + } + } + + if (tracer->CheckOption(ONETRACE_DEVICE_TIMELINE) || + tracer->CheckOption(ONETRACE_CHROME_DEVICE_TIMELINE) || + 
tracer->CheckOption(ONETRACE_DEVICE_TIMING)) { + + ZeKernelCollector* ze_kernel_collector = nullptr; + ClKernelCollector* cl_cpu_kernel_collector = nullptr; + ClKernelCollector* cl_gpu_kernel_collector = nullptr; + + OnZeKernelFinishCallback ze_callback = nullptr; + OnClKernelFinishCallback cl_callback = nullptr; + if (tracer->CheckOption(ONETRACE_DEVICE_TIMELINE) && + tracer->CheckOption(ONETRACE_CHROME_DEVICE_TIMELINE)) { + ze_callback = DeviceAndChromeTimelineCallback; + cl_callback = DeviceAndChromeTimelineCallback; + } else if (tracer->CheckOption(ONETRACE_DEVICE_TIMELINE)) { + ze_callback = DeviceTimelineCallback; + cl_callback = DeviceTimelineCallback; + } else if (tracer->CheckOption(ONETRACE_CHROME_DEVICE_TIMELINE)) { + ze_callback = ChromeTimelineCallback; + cl_callback = ChromeTimelineCallback; + } + + ze_kernel_collector = ZeKernelCollector::Create( + tracer->start_time_, ze_callback, tracer); + if (ze_kernel_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create kernel collector for L0 backend" << + std::endl; + } + tracer->ze_kernel_collector_ = ze_kernel_collector; + + if (cl_cpu_device != nullptr) { + cl_cpu_kernel_collector = ClKernelCollector::Create( + cl_cpu_device, tracer->start_time_, cl_callback, tracer); + if (cl_cpu_kernel_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create kernel collector for CL CPU backend" << + std::endl; + } + tracer->cl_cpu_kernel_collector_ = cl_cpu_kernel_collector; + } + + if (cl_gpu_device != nullptr) { + cl_gpu_kernel_collector = ClKernelCollector::Create( + cl_gpu_device, tracer->start_time_, cl_callback, tracer); + if (cl_gpu_kernel_collector == nullptr) { + std::cerr << + "[WARNING] Unable to create kernel collector for CL GPU backend" << + std::endl; + } + tracer->cl_gpu_kernel_collector_ = cl_gpu_kernel_collector; + } + + if (ze_kernel_collector == nullptr && + cl_cpu_kernel_collector == nullptr && + cl_gpu_kernel_collector == nullptr) { + delete tracer; + return nullptr; + } + 
} + + return tracer; + } + + ~UnifiedTracer() { + std::chrono::steady_clock::time_point end_time = + std::chrono::steady_clock::now(); + std::chrono::duration duration = + end_time - start_time_; + total_execution_time_ = duration.count(); + + if (cl_cpu_api_collector_ != nullptr) { + cl_cpu_api_collector_->DisableTracing(); + } + if (cl_gpu_api_collector_ != nullptr) { + cl_gpu_api_collector_->DisableTracing(); + } + if (ze_api_collector_ != nullptr) { + ze_api_collector_->DisableTracing(); + } + + if (cl_cpu_kernel_collector_ != nullptr) { + cl_cpu_kernel_collector_->DisableTracing(); + } + if (cl_gpu_kernel_collector_ != nullptr) { + cl_gpu_kernel_collector_->DisableTracing(); + } + + Report(); + + if (cl_cpu_api_collector_ != nullptr) { + delete cl_cpu_api_collector_; + } + if (cl_gpu_api_collector_ != nullptr) { + delete cl_gpu_api_collector_; + } + if (ze_api_collector_ != nullptr) { + delete ze_api_collector_; + } + + if (cl_cpu_kernel_collector_ != nullptr) { + delete cl_cpu_kernel_collector_; + } + if (cl_gpu_kernel_collector_ != nullptr) { + delete cl_gpu_kernel_collector_; + } + + if (chrome_trace_.is_open()) { + CloseTraceFile(); + } + } + + bool CheckOption(unsigned option) { + return (options_ & (1 << option)); + } + + UnifiedTracer(const UnifiedTracer& copy) = delete; + UnifiedTracer& operator=(const UnifiedTracer& copy) = delete; + + private: + UnifiedTracer(unsigned options) + : options_(options) { + start_time_ = std::chrono::steady_clock::now(); + + if (CheckOption(ONETRACE_CHROME_DEVICE_TIMELINE) || + CheckOption(ONETRACE_CHROME_CALL_LOGGING)) { + OpenTraceFile(); + } + } + + static uint64_t CalculateTotalTime(const ZeApiCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ZeFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + if (function_info_map.size() != 0) { + for (auto& value : function_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + 
static uint64_t CalculateTotalTime(const ZeKernelCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ZeKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + if (kernel_info_map.size() != 0) { + for (auto& value : kernel_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + static uint64_t CalculateTotalTime(const ClApiCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + if (function_info_map.size() != 0) { + for (auto& value : function_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + static uint64_t CalculateTotalTime(const ClKernelCollector* collector) { + PTI_ASSERT(collector != nullptr); + uint64_t total_time = 0; + + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + if (kernel_info_map.size() != 0) { + for (auto& value : kernel_info_map) { + total_time += value.second.total_time; + } + } + + return total_time; + } + + static void PrintBackendTable( + const ZeApiCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ZeFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + PTI_ASSERT(function_info_map.size() > 0); + ZeApiCollector::PrintFunctionsTable(function_info_map); + } + } + + static void PrintBackendTable( + const ZeKernelCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << 
device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ZeKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + PTI_ASSERT(kernel_info_map.size() > 0); + ZeKernelCollector::PrintKernelsTable(kernel_info_map); + } + } + + static void PrintBackendTable( + const ClApiCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ClFunctionInfoMap& function_info_map = collector->GetFunctionInfoMap(); + PTI_ASSERT(function_info_map.size() > 0); + ClApiCollector::PrintFunctionsTable(function_info_map); + } + } + + static void PrintBackendTable( + const ClKernelCollector* collector, const char* device_type) { + PTI_ASSERT(collector != nullptr); + PTI_ASSERT(device_type != nullptr); + + uint64_t total_duration = CalculateTotalTime(collector); + if (total_duration > 0) { + std::cerr << std::endl; + std::cerr << "== " << device_type << " Backend: ==" << std::endl; + std::cerr << std::endl; + + const ClKernelInfoMap& kernel_info_map = collector->GetKernelInfoMap(); + PTI_ASSERT(kernel_info_map.size() > 0); + ClKernelCollector::PrintKernelsTable(kernel_info_map); + } + } + + template + void ReportTiming( + const ZeCollector* ze_collector, + const ClCollector* cl_cpu_collector, + const ClCollector* cl_gpu_collector, + const char* type) { + PTI_ASSERT (cl_cpu_collector != nullptr || cl_gpu_collector != nullptr); + + std::string ze_title = + std::string("Total ") + std::string(type) + + " Time for L0 backend (ns): "; + std::string cl_cpu_title = + std::string("Total ") + std::string(type) + + " Time for CL CPU backend (ns): "; + std::string cl_gpu_title = + std::string("Total ") + std::string(type) + + " Time for CL GPU backend (ns): "; + size_t title_width = 
std::max(cl_cpu_title.size(), cl_gpu_title.size()); + title_width = std::max(title_width, ze_title.size()); + const size_t time_width = 20; + + std::cerr << std::endl; + std::cerr << "=== " << type << " Timing Results: ===" << std::endl; + std::cerr << std::endl; + std::cerr << std::setw(title_width) << "Total Execution Time (ns): " << + std::setw(time_width) << total_execution_time_ << std::endl; + + if (ze_collector != nullptr) { + uint64_t total_time = CalculateTotalTime(ze_collector); + if (total_time > 0) { + std::cerr << std::setw(title_width) << ze_title << + std::setw(time_width) << total_time << + std::endl; + } + } + if (cl_cpu_collector != nullptr) { + uint64_t total_time = CalculateTotalTime(cl_cpu_collector); + if (total_time > 0) { + std::cerr << std::setw(title_width) << cl_cpu_title << + std::setw(time_width) << total_time << + std::endl; + } + } + if (cl_gpu_collector != nullptr) { + uint64_t total_time = CalculateTotalTime(cl_gpu_collector); + if (total_time > 0) { + std::cerr << std::setw(title_width) << cl_gpu_title << + std::setw(time_width) << total_time << + std::endl; + } + } + + if (ze_collector != nullptr) { + PrintBackendTable(ze_collector, "L0"); + } + if (cl_cpu_collector != nullptr) { + PrintBackendTable(cl_cpu_collector, "CL CPU"); + } + if (cl_gpu_collector != nullptr) { + PrintBackendTable(cl_gpu_collector, "CL GPU"); + } + + std::cerr << std::endl; + } + + void Report() { + if (CheckOption(ONETRACE_HOST_TIMING)) { + ReportTiming( + ze_api_collector_, + cl_cpu_api_collector_, + cl_gpu_api_collector_, + "API"); + } + if (CheckOption(ONETRACE_DEVICE_TIMING)) { + ReportTiming( + ze_kernel_collector_, + cl_cpu_kernel_collector_, + cl_gpu_kernel_collector_, + "Device"); + } + std::cerr << std::endl; + } + + static void DeviceTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + std::stringstream stream; + stream << "Device Timeline (queue: " << 
queue << + "): " << name << " [ns] = " << + queued << " (queued) " << + submitted << " (submit) " << + started << " (start) " << + ended << " (end)" << std::endl; + std::cerr << stream.str(); + } + + void OpenTraceFile() { + chrome_trace_.open(kChromeTraceFileName); + PTI_ASSERT(chrome_trace_.is_open()); + chrome_trace_ << "[" << std::endl; + chrome_trace_ << + "{\"ph\":\"M\", \"name\":\"process_name\", \"pid\":" << + utils::GetPid() << ", \"tid\":0, \"args\":{\"name\":\"" << + utils::GetExecutableName() << "\"}}," << std::endl; + } + + void CloseTraceFile() { + PTI_ASSERT(chrome_trace_.is_open()); + chrome_trace_.close(); + std::cerr << "Timeline was stored to " << + kChromeTraceFileName << std::endl; + } + + static void ChromeTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + UnifiedTracer* tracer = reinterpret_cast(data); + PTI_ASSERT(tracer != nullptr); + + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << utils::GetPid() << + ", \"tid\":" << reinterpret_cast(queue) << + ", \"name\":\"" << name << + "\", \"ts\": " << started / NSEC_IN_USEC << + ", \"dur\":" << (ended - started) / NSEC_IN_USEC << + "}," << std::endl; + tracer->chrome_trace_ << stream.str(); + } + + static void DeviceAndChromeTimelineCallback( + void* data, void* queue, const std::string& name, + uint64_t queued, uint64_t submitted, + uint64_t started, uint64_t ended) { + DeviceTimelineCallback(data, queue, name, queued, submitted, started, ended); + ChromeTimelineCallback(data, queue, name, queued, submitted, started, ended); + } + + static void ChromeLoggingCallback( + void* data, const std::string& name, + uint64_t started, uint64_t ended) { + UnifiedTracer* tracer = reinterpret_cast(data); + PTI_ASSERT(tracer != nullptr); + + std::stringstream stream; + stream << "{\"ph\":\"X\", \"pid\":" << + utils::GetPid() << ", \"tid\":" << utils::GetTid() << + ", \"name\":\"" << name << + 
"\", \"ts\": " << started / NSEC_IN_USEC << + ", \"dur\":" << (ended - started) / NSEC_IN_USEC << + "}," << std::endl; + tracer->chrome_trace_ << stream.str(); + } + + private: + unsigned options_; + + std::chrono::time_point start_time_; + uint64_t total_execution_time_ = 0; + + ZeApiCollector* ze_api_collector_ = nullptr; + ClApiCollector* cl_cpu_api_collector_ = nullptr; + ClApiCollector* cl_gpu_api_collector_ = nullptr; + + ZeKernelCollector* ze_kernel_collector_ = nullptr; + ClKernelCollector* cl_cpu_kernel_collector_ = nullptr; + ClKernelCollector* cl_gpu_kernel_collector_ = nullptr; + + std::ofstream chrome_trace_; +}; + +#endif // PTI_SAMPLES_ONETRACE_UNIFIED_TRACER_H_ \ No newline at end of file