diff --git a/VERSION b/VERSION index 2907e54..e5d36c5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.42.1 \ No newline at end of file +0.42.4 \ No newline at end of file diff --git a/build_utils/CMakeLists.txt b/build_utils/CMakeLists.txt index cb70497..0b886c6 100644 --- a/build_utils/CMakeLists.txt +++ b/build_utils/CMakeLists.txt @@ -166,6 +166,7 @@ macro(FindIGALibrary TARGET) "IGA is not found. " "You may need to install Intel(R) Processor Graphics Driver to fix this issue.") else() + list(GET IGA_DLL_PATH 0 IGA_DLL_PATH) message(STATUS "IGA is found at ${IGA_DLL_PATH}") find_library(IGA_LIB_PATH diff --git a/loader/loader.cc b/loader/loader.cc index 98aa14e..b23960d 100644 --- a/loader/loader.cc +++ b/loader/loader.cc @@ -63,12 +63,10 @@ static bool CheckBitness(HANDLE parent, HANDLE child) { int main(int argc, char* argv[]) { std::string library_file_name = GetLibFileName(); std::string executable_path = utils::GetExecutablePath(); - std::string library_file_path = executable_path + library_file_name; + std::string library_file_path = executable_path + library_file_name; if (!IsFileExists(library_file_path.c_str())) { - std::cout << "[ERROR] Failed to find " << - library_file_name << " near the loader" << std::endl; - return 0; + library_file_path = library_file_name; } SharedLibrary* lib = SharedLibrary::Create(library_file_path); diff --git a/tests/run.py b/tests/run.py index 7a0fe51..e706dcf 100644 --- a/tests/run.py +++ b/tests/run.py @@ -35,6 +35,7 @@ ["sysmon", "-p", "-l", "-d"], ["onetrace", "-c", "-h", "-d", "-v", "-t", "-s", + "--demangle", "--kernels-per-tile", "--chrome-call-logging", "--chrome-device-timeline", @@ -44,6 +45,7 @@ "cl", "ze", "omp"], ["cl_tracer", "-c", "-h", "-d", "-v", "-t", "-s", + "--demangle", "--chrome-call-logging", "--chrome-device-timeline", "--chrome-kernel-timeline", @@ -52,6 +54,7 @@ "gpu", "dpc", "omp"], ["ze_tracer", "-c", "-h", "-d", "-v", "-t", "-s", + "--demangle", "--kernels-per-tile", "--chrome-call-logging", "--chrome-device-timeline", diff --git a/tests/tools/cl_tracer.py b/tests/tools/cl_tracer.py index cefdebd..50f83b6 100644 --- a/tests/tools/cl_tracer.py +++ b/tests/tools/cl_tracer.py @@ -51,7 +51,7 @@ def run(path, option): if option == "gpu": command = [file_name_prefix + "cl_tracer" + file_extention,\ "-h", "-d", "-t", app_file, "gpu", "1024", "1"] - elif option == "-v" or option == "--conditional-collection": + elif option == "-v" or option == "--demangle" or option == "--conditional-collection": command = [file_name_prefix + "cl_tracer" + file_extention,\ "-d", option, app_file, "cpu", "1024", "1"] else: @@ -102,6 +102,8 @@ def main(option): option = "-s" if len(sys.argv) > 1 and sys.argv[1] == "-v": option = "-v" + if len(sys.argv) > 1 and sys.argv[1] == "--demangle": + option = "--demangle" if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging": option = "--chrome-call-logging" if len(sys.argv) > 1 and sys.argv[1] == "--chrome-device-timeline": diff --git a/tests/tools/onetrace.py b/tests/tools/onetrace.py index 73df1c1..63c293b 100644 --- a/tests/tools/onetrace.py +++ b/tests/tools/onetrace.py @@ -35,7 +35,8 @@ def run(path, option): app_folder = utils.get_sample_executable_path("omp_gemm") app_file = os.path.join(app_folder, "omp_gemm") command = ["./onetrace", "-h", "-d", "-t", app_file, "gpu", "1024", "1"] - elif option == "-v" or option == "--kernels-per-tile" or option == "--conditional-collection": + elif option == "-v" or option == "--demangle" or\ + option == "--kernels-per-tile" or option == "--conditional-collection": app_folder = utils.get_sample_executable_path("dpc_gemm") app_file = os.path.join(app_folder, "dpc_gemm") command = ["./onetrace", "-d", option, app_file, "gpu", "1024", "1"] @@ -88,6 +89,8 @@ def main(option): option = "-s" if len(sys.argv) > 1 and sys.argv[1] == "-v": option = "-v" + if len(sys.argv) > 1 and sys.argv[1] == "--demangle": + option = "--demangle" if len(sys.argv) > 1 and sys.argv[1] == "--kernels-per-tile": option = "--kernels-per-tile" if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging": diff --git a/tests/tools/ze_tracer.py b/tests/tools/ze_tracer.py index 08a6852..35d346b 100644 --- a/tests/tools/ze_tracer.py +++ b/tests/tools/ze_tracer.py @@ -30,7 +30,8 @@ def run(path, option): app_folder = utils.get_sample_executable_path("omp_gemm") app_file = os.path.join(app_folder, "omp_gemm") command = ["./ze_tracer", "-h", "-d", "-t", app_file, "gpu", "1024", "1"] - elif option == "-v" or option == "--kernels-per-tile" or option == "--conditional-collection": + elif option == "-v" or option == "--demangle" or\ + option == "--kernels-per-tile" or option == "--conditional-collection": app_folder = utils.get_sample_executable_path("ze_gemm") app_file = os.path.join(app_folder, "ze_gemm") command = ["./ze_tracer", "-d", option, app_file, "1024", "1"] @@ -81,6 +82,8 @@ def main(option): option = "-s" if len(sys.argv) > 1 and sys.argv[1] == "-v": option = "-v" + if len(sys.argv) > 1 and sys.argv[1] == "--demangle": + option = "--demangle" if len(sys.argv) > 1 and sys.argv[1] == "--kernels-per-tile": option = "--kernels-per-tile" if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging": diff --git a/tools/cl_tracer/README.md b/tools/cl_tracer/README.md index 617baa2..e07a25c 100644 --- a/tools/cl_tracer/README.md +++ b/tools/cl_tracer/README.md @@ -16,6 +16,7 @@ Options: --chrome-kernel-timeline Dump device activities to JSON file per kernel name --chrome-device-stages Dump device activities by stages to JSON file --verbose [-v] Enable verbose mode to show more kernel information +--demangle Demangle DPC++ kernel names --tid Print thread ID into host API trace --pid Print process ID into host API and device activity trace --output [-o] Print console logs into the file diff --git a/tools/cl_tracer/cl_api_callbacks.h b/tools/cl_tracer/cl_api_callbacks.h index eebaeda..4aa5720 100644 --- a/tools/cl_tracer/cl_api_callbacks.h +++ b/tools/cl_tracer/cl_api_callbacks.h @@ -2931,7 +2931,8 @@ static void clEnqueueTaskOnEnter( stream << " commandQueue = " << *(params->commandQueue); stream << " kernel = " << *(params->kernel); if (*(params->kernel) != nullptr) { - std::string kernel_name = utils::cl::GetKernelName(*(params->kernel)); + std::string kernel_name = utils::cl::GetKernelName( + *(params->kernel), collector->Demangle()); if (!kernel_name.empty()) { stream << " (" << kernel_name << ")"; } @@ -3582,7 +3583,8 @@ static void clEnqueueNDRangeKernelOnEnter( stream << " commandQueue = " << *(params->commandQueue); stream << " kernel = " << *(params->kernel); if (*(params->kernel) != nullptr) { - std::string kernel_name = utils::cl::GetKernelName(*(params->kernel)); + std::string kernel_name = utils::cl::GetKernelName( + *(params->kernel), collector->Demangle()); if (!kernel_name.empty()) { stream << " (" << kernel_name << ")"; } @@ -5966,6 +5968,9 @@ static void clCreateKernelOnEnter( stream << " kernelName = \"\""; } else { stream << " kernelName = \"" << *(params->kernelName) << "\""; + if (collector->Demangle()) { + stream << " (" << utils::Demangle(*(params->kernelName)) << ")"; + } } stream << " errcodeRet = " << *(params->errcodeRet); stream << std::endl; diff --git a/tools/cl_tracer/cl_api_collector.h b/tools/cl_tracer/cl_api_collector.h index dfcf568..05c1d70 100644 --- a/tools/cl_tracer/cl_api_collector.h +++ b/tools/cl_tracer/cl_api_collector.h @@ -60,7 +60,7 @@ class ClApiCollector { static ClApiCollector* Create( cl_device_id device, Correlator* correlator, - ApiCollectorOptions options = {false, false, false}, + ApiCollectorOptions options, OnClFunctionFinishCallback callback = nullptr, void* callback_data = nullptr) { PTI_ASSERT(device != nullptr); @@ -115,6 +115,10 @@ class ClApiCollector { return options_.need_pid; } + bool Demangle() const { + return options_.demangle; + } + void Log(const std::string& text) { PTI_ASSERT(correlator_ != nullptr); correlator_->Log(text); @@ -332,7 +336,7 @@ class ClApiCollector { ClApiTracer* tracer_ = nullptr; Correlator* correlator_ = nullptr; - ApiCollectorOptions options_ = {false, false, false}; + ApiCollectorOptions options_; cl_device_type device_type_ = CL_DEVICE_TYPE_ALL; OnClFunctionFinishCallback callback_ = nullptr; diff --git a/tools/cl_tracer/cl_kernel_collector.h b/tools/cl_tracer/cl_kernel_collector.h index 91e22d9..5e9dd0f 100644 --- a/tools/cl_tracer/cl_kernel_collector.h +++ b/tools/cl_tracer/cl_kernel_collector.h @@ -113,7 +113,7 @@ class ClKernelCollector { static ClKernelCollector* Create( cl_device_id device, Correlator* correlator, - bool verbose, + KernelCollectorOptions options, OnClKernelFinishCallback callback = nullptr, void* callback_data = nullptr) { PTI_ASSERT(device != nullptr); @@ -121,7 +121,7 @@ class ClKernelCollector { TraceGuard guard; ClKernelCollector* collector = new ClKernelCollector( - device, correlator, verbose, callback, callback_data); + device, correlator, options, callback, callback_data); PTI_ASSERT(collector != nullptr); ClApiTracer* tracer = new ClApiTracer(device, Callback, collector); @@ -281,12 +281,12 @@ class ClKernelCollector { ClKernelCollector( cl_device_id device, Correlator* correlator, - bool verbose, + KernelCollectorOptions options, OnClKernelFinishCallback callback, void* callback_data) : device_(device), correlator_(correlator), - verbose_(verbose), + options_(options), callback_(callback), callback_data_(callback_data), kernel_id_(1) { @@ -443,6 +443,13 @@ class ClKernelCollector { PTI_ASSERT(device != nullptr); AddKernelInterval(instance, device, started, ended); #else // PTI_KERNEL_INTERVALS + std::string name = instance->props.name; + PTI_ASSERT(!name.empty()); + + if (options_.verbose) { + name = GetVerboseName(&(instance->props)); + } + uint64_t host_queued = 0, host_submitted = 0; uint64_t host_started = 0, host_ended = 0; ComputeHostTimestamps( @@ -451,19 +458,12 @@ class ClKernelCollector { host_queued, host_submitted, host_started, host_ended); AddKernelInfo( - &instance->props, + name, host_submitted - host_queued, host_started - host_submitted, host_ended - host_started); if (callback_ != nullptr) { - std::string name = instance->props.name; - PTI_ASSERT(!name.empty()); - - if (verbose_) { - name = GetVerboseName(&(instance->props)); - } - std::stringstream stream; stream << std::hex << queue; @@ -551,17 +551,10 @@ class ClKernelCollector { } void AddKernelInfo( - const ClKernelProps* props, uint64_t queued_time, + std::string name, uint64_t queued_time, uint64_t submit_time, uint64_t execute_time) { - PTI_ASSERT(props != nullptr); - - std::string name = props->name; PTI_ASSERT(!name.empty()); - if (verbose_) { - name = GetVerboseName(props); - } - if (kernel_info_map_.count(name) == 0) { ClKernelInfo info; info.queued_time = queued_time; @@ -606,7 +599,7 @@ class ClKernelCollector { std::string name = instance->props.name; PTI_ASSERT(!name.empty()); - if (verbose_) { + if (options_.verbose) { name = GetVerboseName(&instance->props); } @@ -755,11 +748,9 @@ class ClKernelCollector { cl_ulong host_timestamp = 0; utils::cl::GetTimestamps( collector->device_, &enqueue_data->host_sync, &enqueue_data->device_sync); -#ifndef PTI_KERNEL_INTERVALS PTI_ASSERT(collector->correlator_ != nullptr); enqueue_data->host_sync = collector->correlator_->GetTimestamp(enqueue_data->host_sync); -#endif const T* params = reinterpret_cast(data->functionParams); PTI_ASSERT(params != nullptr); @@ -797,7 +788,8 @@ class ClKernelCollector { instance->event = **(params->event); cl_kernel kernel = *(params->kernel); - instance->props.name = utils::cl::GetKernelName(kernel); + instance->props.name = utils::cl::GetKernelName( + kernel, collector->options_.demangle); cl_command_queue queue = *(params->commandQueue); PTI_ASSERT(queue != nullptr); @@ -1401,7 +1393,7 @@ class ClKernelCollector { ClApiTracer* tracer_ = nullptr; Correlator* correlator_ = nullptr; - bool verbose_ = false; + KernelCollectorOptions options_; std::atomic kernel_id_; cl_device_id device_ = nullptr; diff --git a/tools/cl_tracer/cl_tracer.h b/tools/cl_tracer/cl_tracer.h index acda684..18dbe9e 100644 --- a/tools/cl_tracer/cl_tracer.h +++ b/tools/cl_tracer/cl_tracer.h @@ -81,11 +81,14 @@ class ClTracer { callback = ChromeStagesCallback; } + KernelCollectorOptions kernel_options; + kernel_options.verbose = tracer->CheckOption(TRACE_VERBOSE); + kernel_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); + if (cpu_device != nullptr) { cpu_kernel_collector = ClKernelCollector::Create( cpu_device, &tracer->correlator_, - tracer->CheckOption(TRACE_VERBOSE), - callback, tracer); + kernel_options, callback, tracer); if (cpu_kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector for CPU backend" << @@ -97,8 +100,7 @@ class ClTracer { if (gpu_device != nullptr) { gpu_kernel_collector = ClKernelCollector::Create( gpu_device, &tracer->correlator_, - tracer->CheckOption(TRACE_VERBOSE), - callback, tracer); + kernel_options, callback, tracer); if (gpu_kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector for GPU backend" << @@ -125,15 +127,16 @@ class ClTracer { callback = ChromeLoggingCallback; } - ApiCollectorOptions cl_api_options{false, false, false}; - cl_api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); - cl_api_options.need_tid = tracer->CheckOption(TRACE_TID); - cl_api_options.need_pid = tracer->CheckOption(TRACE_PID); + ApiCollectorOptions api_options; + api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); + api_options.need_tid = tracer->CheckOption(TRACE_TID); + api_options.need_pid = tracer->CheckOption(TRACE_PID); + api_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); if (cpu_device != nullptr) { cpu_api_collector = ClApiCollector::Create( cpu_device, &tracer->correlator_, - cl_api_options, callback, tracer); + api_options, callback, tracer); if (cpu_api_collector == nullptr) { std::cerr << "[WARNING] Unable to create API collector for CPU backend" << @@ -145,7 +148,7 @@ class ClTracer { if (gpu_device != nullptr) { gpu_api_collector = ClApiCollector::Create( gpu_device, &tracer->correlator_, - cl_api_options, callback, tracer); + api_options, callback, tracer); if (gpu_api_collector == nullptr) { std::cerr << "[WARNING] Unable to create API collector for GPU backend" << diff --git a/tools/cl_tracer/tool.cc b/tools/cl_tracer/tool.cc index ca90b09..be596bd 100644 --- a/tools/cl_tracer/tool.cc +++ b/tools/cl_tracer/tool.cc @@ -60,6 +60,10 @@ void Usage() { "--verbose [-v] " << "Enable verbose mode to show more kernel information" << std::endl; + std::cout << + "--demangle " << + "Demangle DPC++ kernel names" << + std::endl; std::cout << "--tid " << "Print thread ID into host API trace" << @@ -125,6 +129,9 @@ int ParseArgs(int argc, char* argv[]) { strcmp(argv[i], "-v") == 0) { utils::SetEnv("CLT_Verbose", "1"); ++app_index; + } else if (strcmp(argv[i], "--demangle") == 0) { + utils::SetEnv("CLT_Demangle", "1"); + ++app_index; } else if (strcmp(argv[i], "--tid") == 0) { utils::SetEnv("CLT_Tid", "1"); ++app_index; @@ -235,6 +242,11 @@ static TraceOptions ReadArgs() { flags |= (1 << TRACE_VERBOSE); } + value = utils::GetEnv("CLT_Demangle"); + if (!value.empty() && value == "1") { + flags |= (1 << TRACE_DEMANGLE); + } + value = utils::GetEnv("CLT_Tid"); if (!value.empty() && value == "1") { flags |= (1 << TRACE_TID); diff --git a/tools/oneprof/README.md b/tools/oneprof/README.md index f0b7983..78870e5 100644 --- a/tools/oneprof/README.md +++ b/tools/oneprof/README.md @@ -24,15 +24,12 @@ Options: ``` == Raw Metrics == -SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, -0,682666,759081,1111,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,64,768,78019584000,1149,1149,1,1048575,69165123, -0,682666,783253,1147,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,384,78020266666,1149,1149,1,1048575,69165123, -0,682666,783231,1147,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,256,78020949333,1149,1149,1,1048575,69165123, -0,682666,783242,1147,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,256,78021632000,1149,1149,1,1048575,69165123, -0,671166,770086,1147,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,256,78022303166,1149,1149,8,32,69165123, -0,11500,13167,1144,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9408,704,78022314666,1149,1149,1,32,69165123, -0,682666,783231,1147,99.2111,0,0,0,0,0,16720,8.7586,88.6453,2.93694,6.15843,4.01323,1.40595,1.95425,89.3107,0,0,0,0,0,0,0,0,0,0,0,555467,0,35549888,0,0,0,27271424,8444928,9089280,8205056,78022997333,1149,1149,1,32,69165123, -0,682666,783230,1147,100,0,0,0,0,0,336,72.8647,27.1342,35.0761,51.0768,52.328,1.51334,11.8151,99.9455,0,0,0,0,0,0,0,0,0,0,0,2220109,0,142086976,0,0,0,142055680,43008,6853120,228096,78023680000,1149,1149,1,32,69165123, +SubDeviceId,HostTimestamp,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, +0,343304199,682666,753618,1103,100,0,0,0,0,0,336,56.1465,43.8535,22.9954,36.4886,36.9501,1.45587,10.8754,99.9425,0,0,0,0,0,0,0,0,0,0,0,1966165,0,125834560,0,0,0,125785344,43008,6073216,44032,238066688000,1149,1149,1,32,1179654540, +0,343986865,682666,773375,1132,100,0,0,0,0,0,672,67.1219,32.8779,18.8542,40.5393,39.8573,1.30636,11.9009,99.8918,0,0,0,0,0,0,0,0,0,0,0,2207249,0,141263936,0,0,0,141191680,86016,6814912,43520,238067370666,1149,1149,1,32,1179654540, +0,344669532,682666,783253,1147,100,0,0,0,0,0,336,67.4071,32.5929,19.2494,40.8147,40.1636,1.31184,11.9925,99.9399,0,0,0,0,0,0,0,0,0,0,0,2253502,0,144224128,0,0,0,144171136,43008,6954560,86272,238068053333,1149,1149,1,32,1179654540, +0,345352199,682666,773433,1132,100,0,0,0,0,0,672,66.4875,33.5125,18.8932,40.1962,39.5619,1.31041,11.8066,99.8886,0,0,0,0,0,0,0,0,0,0,0,2189916,0,140154624,0,0,0,140070272,86016,6761088,86528,238068736000,1149,1149,1,32,1179654540, +0,346034865,682666,773375,1132,100,0,0,0,0,0,336,66.2652,33.7344,17.3913,39.4322,38.6375,1.28662,11.561,99.9126,0,0,0,0,0,0,0,0,0,0,0,2144985,0,137279040,0,0,0,137233664,43008,6620416,43520,238069418666,1149,1149,1,32,1179654540, ... ``` @@ -40,13 +37,17 @@ SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThr ``` == Raw Kernel Intervals (Level Zero) == -SubDeviceId,Name,Start,End, -0,zeCommandListAppendMemoryCopy[4194304 bytes],333082304833,333082625166, -0,zeCommandListAppendMemoryCopy[4194304 bytes],333082625333,333082914166, -0,zeCommandListAppendBarrier,333082914333,333082915999, -0,GEMM[SIMD32 {4; 1024; 1} {256; 1; 1}],333082916166,333125617666, -0,zeCommandListAppendBarrier,333125617833,333125618999, -0,zeCommandListAppendMemoryCopy[4194304 bytes],333125619166,333126081832, +Kernel,zeCommandListAppendMemoryCopy(M2D)[4194304 bytes], +SubDeviceId,Start,End, +0,106673485,106963818, + +Kernel,zeCommandListAppendBarrier, +SubDeviceId,Start,End, +0,106963985,106965318, + +Kernel,GEMM[SIMD32, {4; 1024; 1} {256; 1; 1}], +SubDeviceId,Start,End, +0,106965485,149496985, ... ``` @@ -54,16 +55,14 @@ SubDeviceId,Name,Start,End, ``` == Kernel Metrics (OpenCL) == -Kernel,clEnqueueWriteBuffer[4194304 bytes], -SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, -0,374750,364111,971,93.864,0,0,0,0,0,6280,2.71498,90.43,0.0452701,1.86351,0.0731233,1.02394,0.860377,85.2366,0,0,0,0,0,0,0,0,0,0,0,145957,0,9341248,0,0,0,6262528,3028480,3227200,2642816,259520606666,1149,1149,8,32,0, +Kernel,clEnqueueWriteBufferclEnqueueWriteBuffer[4194304 bytes], +SubDeviceId,HostTimestamp,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, +0,342796169,248333,260664,1049,89.4032,0,0,0,0,0,5000,2.97376,84.5846,0.0751366,2.04254,0.111094,1.03615,0.946629,79.7974,0,0,0,0,0,0,0,0,0,0,0,115437,0,7387968,0,0,0,4883072,2340224,2575616,2002560,29069653333,1149,1149,1,32,1179654540, Kernel,GEMM[SIMD32, {1024, 1024, 1}, {0, 0, 0}], -SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, -0,246666,248846,1008,100,0,0,0,0,0,2248,29.8595,62.0995,12.2179,19.2157,19.4196,1.46249,6.01644,88.0508,0,0,0,0,0,0,0,0,0,0,0,382460,0,24477440,0,0,0,23375744,1162752,2120704,1556480,259520853333,1149,1149,1,32,0, -0,682666,783231,1147,100,0,0,0,0,0,336,64.3564,35.6436,22.5158,39.9458,41.0742,1.38486,11.9982,99.9483,0,0,0,0,0,0,0,0,0,0,0,2254528,0,144289792,0,0,0,144233472,43008,6960768,43264,259521536000,1149,1149,1,32,0, -0,682666,759046,1111,100,0,0,0,0,0,672,64.7454,35.2543,18.8366,39.1944,38.9525,1.31759,11.5678,99.8654,0,0,0,0,0,0,0,0,0,0,0,2105639,0,134760896,0,0,0,134665472,86016,6498496,86784,259522218666,1149,1149,1,32,0, -0,682666,783230,1147,100,0,0,0,0,0,672,65.9991,34.0009,18.9462,39.8569,39.5415,1.31341,11.7531,99.871,0,0,0,0,0,0,0,0,0,0,0,2207611,0,141287104,0,0,0,141199616,86016,6814400,43264,259522901333,1149,1149,1,32,0, +SubDeviceId,HostTimestamp,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, +0,344161502,590666,653470,1106,95.8942,0,0,0,0,0,672,52.4014,42.434,23.2914,34.7129,35.3936,1.49752,10.3669,94.677,0,0,0,0,0,0,0,0,0,0,0,1624753,0,103984192,0,0,0,103949056,43008,5054976,20736,29071018666,1149,1149,1,32,1179654540, +0,344844169,682666,783253,1147,100,0,0,0,0,0,336,69.2735,30.7265,19.4396,41.9169,41.3087,1.30476,12.3252,99.9455,0,0,0,0,0,0,0,0,0,0,0,2316056,0,148227584,0,0,0,148178304,43008,7149248,67328,29071701333,1149,1149,1,32,1179654540, ... ``` @@ -71,13 +70,13 @@ SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThr ``` == Aggregated Metrics (Level Zero) == -Kernel,zeCommandListAppendMemoryCopy[4194304 bytes], -SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, -0,105416,103059,977,80.2298,0,0,0,0,0,1821,2.8716,62.56,0.0550251,1.97421,0.135966,1.02677,0.785917,58.3937,0,0,0,0,0,0,0,0,0,0,0,38684,0,2475776,0,0,0,1713920,776704,900288,370112,344182442666,1099,1099,1,32,428071356, +Kernel,zeCommandListAppendMemoryCopy(M2D)[4194304 bytes], +SubDeviceId,HostTimestamp,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, +0,162137469,581750,565065,971,96.6676,0,0,0,0,0,10243,3.12565,88.8954,0.0217637,2.09571,0.0898717,1.01006,0.91929,82.8795,0,0,0,0,0,0,0,0,0,0,0,240969,0,15422016,0,0,0,9946880,4878464,5353472,4689728,70536192000,1149,1149,1,32,1179654540, -Kernel,GEMM[SIMD32 {4; 1024; 1} {256; 1; 1}], -SubDeviceId,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, -0,43690626,49697972,1137,100,0,0,0,0,0,47331,73.4137,26.532,28.8329,49.2144,48.9965,1.41675,11.2316,99.6994,0,0,0,0,0,0,0,0,0,0,0,134052413,0,8579354432,0,0,0,8568335616,11876864,420841984,12214336,344183127166,1149,1149,1,32,428071356, +Kernel,GEMM[SIMD32, {4; 1024; 1} {256; 1; 1}], +SubDeviceId,HostTimestamp,GpuTime,GpuCoreClocks,AvgGpuCoreFrequencyMHz,GpuBusy,VsThreads,HsThreads,DsThreads,GsThreads,PsThreads,CsThreads,EuActive,EuStall,EuFpuBothActive,Fpu0Active,Fpu1Active,EuAvgIpcRate,EuSendActive,EuThreadOccupancy,RasterizedPixels,HiDepthTestFails,EarlyDepthTestFails,SamplesKilledInPs,PixelsFailingPostPsTests,SamplesWritten,SamplesBlended,SamplerTexels,SamplerTexelMisses,SlmBytesRead,SlmBytesWritten,ShaderMemoryAccesses,ShaderAtomics,L3ShaderThroughput,ShaderBarriers,TypedBytesRead,TypedBytesWritten,UntypedBytesRead,UntypedBytesWritten,GtiReadThroughput,GtiWriteThroughput,QueryBeginTime,CoreFrequencyMHz,EuSliceFrequencyMHz,ReportReason,ContextId,StreamMarker, +0,162820135,43007958,49035563,1139,100,0,0,0,0,0,38909,74.4577,25.5021,29.1877,49.928,49.687,1.41558,11.3856,99.6933,0,0,0,0,0,0,0,0,0,0,0,133982149,0,8574857536,0,0,0,8571003008,7448448,417155712,7897472,70536874666,1149,1149,1,32,1179654540, ... ``` diff --git a/tools/oneprof/profiler.h b/tools/oneprof/profiler.h index cee0dc3..04bf09e 100644 --- a/tools/oneprof/profiler.h +++ b/tools/oneprof/profiler.h @@ -16,27 +16,6 @@ #include "cl_kernel_collector.h" #include "ze_kernel_collector.h" -namespace detail { - -template -uint64_t ConvertTimestamp( - uint64_t timestamp, uint64_t device_freq, - uint64_t host_sync, uint64_t device_sync) { - return timestamp; -} - -template <> -uint64_t ConvertTimestamp( - uint64_t timestamp, uint64_t device_freq, - uint64_t host_sync, uint64_t device_sync) { - PTI_ASSERT(timestamp > host_sync); - uint64_t time_shift = timestamp - host_sync; - return device_sync * static_cast(NSEC_IN_SEC) / - device_freq + time_shift; -} - -} // namespace detail - class Profiler { public: static Profiler* Create(const ProfOptions& options) { @@ -79,8 +58,11 @@ class Profiler { profiler->CheckOption(PROF_KERNEL_METRICS) || profiler->CheckOption(PROF_AGGREGATION)) { + KernelCollectorOptions kernel_options; + kernel_options.verbose = true; + ZeKernelCollector* ze_kernel_collector = ZeKernelCollector::Create( - &(profiler->correlator_), true, false); + &(profiler->correlator_), kernel_options); if (ze_kernel_collector == nullptr) { std::cout << "[WARNING] Unable to create Level Zero kernel collector" << @@ -95,7 +77,7 @@ class Profiler { "[WARNING] Unable to find target OpenCL device" << std::endl; } else { cl_kernel_collector = ClKernelCollector::Create( - device, &(profiler->correlator_), true); + device, &(profiler->correlator_), kernel_options); if (cl_kernel_collector == nullptr) { std::cout << "[WARNING] Unable to create OpenCL kernel collector" << @@ -164,13 +146,12 @@ class Profiler { ze_device_handle_t device = GetZeDevice(device_id_); PTI_ASSERT(device != nullptr); - ze_result_t result = zeDeviceGetGlobalTimestamps( - device, &host_sync_, &device_sync_); - PTI_ASSERT(result == ZE_RESULT_SUCCESS); - device_sync_ &= utils::ze::GetDeviceTimestampMask(device); - - device_freq_ = utils::ze::GetDeviceTimerFrequency(device); - PTI_ASSERT(device_freq_ > 0); + uint64_t device_freq = utils::ze::GetDeviceTimerFrequency(device); + utils::ze::GetMetricTimestamps(device, &host_sync_, &device_sync_); + host_sync_ = correlator_.GetTimestamp(host_sync_); + device_sync_ &= utils::ze::GetMetricTimestampMask(device); + device_sync_ = device_sync_ * static_cast(NSEC_IN_SEC) / device_freq; + PTI_ASSERT(device_freq > 0); } static void PrintTypedValue( @@ -198,6 +179,28 @@ class Profiler { } } + static size_t GetMetricId( + const std::vector& metric_list, + const std::string& metric_name) { + for (size_t i = 0; i < metric_list.size(); ++i) { + if (metric_list[i] == metric_name) { + return i; + } + } + return metric_list.size(); + } + + uint64_t GetHostTime( + const zet_typed_value_t* report, size_t time_id) const { + PTI_ASSERT(report != nullptr); + + PTI_ASSERT(report[time_id].type == ZET_VALUE_TYPE_UINT64); + uint64_t device_time = report[time_id].value.ui64; + + PTI_ASSERT(device_sync_ < device_time); + return host_sync_ + (device_time - device_sync_); + } + void Report() { correlator_.Log("\n"); correlator_.Log("=== Profiling Results ===\n"); @@ -278,12 +281,6 @@ class Profiler { } } - template - uint64_t ConvertTimestamp(uint64_t timestamp) const { - return detail::ConvertTimestamp( - timestamp, device_freq_, host_sync_, device_sync_); - } - template void ReportKernelInterval(const KernelInterval& interval) { std::stringstream stream; @@ -300,8 +297,8 @@ class Profiler { for (auto& device_interval : interval.device_interval_list) { std::stringstream line; line << device_interval.sub_device_id << ","; - line << ConvertTimestamp(device_interval.start) << ","; - line << ConvertTimestamp(device_interval.end) << ","; + line << device_interval.start << ","; + line << device_interval.end << ","; line << std::endl; correlator_.Log(line.str()); } @@ -369,8 +366,12 @@ class Profiler { PTI_ASSERT(!metric_list.empty()); PTI_ASSERT(metric_list.size() == report_size); + size_t report_time_id = GetMetricId(metric_list, "QueryBeginTime"); + PTI_ASSERT(report_time_id < metric_list.size()); + std::stringstream header; header << "SubDeviceId,"; + header << "HostTimestamp,"; for (auto& metric : metric_list) { header << metric << ","; } @@ -392,7 +393,11 @@ class Profiler { for (int i = 0; i < report_count; ++i) { std::stringstream line; line << sub_device_id << ","; + const zet_typed_value_t* report = report_chunk + i * report_size; + uint64_t host_time = GetHostTime(report, report_time_id); + line << host_time << ","; + for (int j = 0; j < report_size; ++j) { PrintTypedValue(line, report[j]); line << ","; @@ -429,24 +434,23 @@ class Profiler { PTI_ASSERT(report_count * report_size == report_chunk_size); const zet_typed_value_t* first_report = report_chunk; - PTI_ASSERT(first_report[report_time_id].type == ZET_VALUE_TYPE_UINT64); - if (first_report[report_time_id].value.ui64 > end) { + uint64_t first_time = GetHostTime(first_report, report_time_id); + if (first_time > end) { continue; } const zet_typed_value_t* last_report = report_chunk + (report_count - 1) * report_size; - PTI_ASSERT(last_report[report_time_id].type == ZET_VALUE_TYPE_UINT64); - if (last_report[report_time_id].value.ui64 < start) { + uint64_t last_time = GetHostTime(last_report, report_time_id); + if (last_time < start) { continue; } for (int i = 0; i < report_count; ++i) { const zet_typed_value_t* report = report_chunk + i * report_size; - zet_typed_value_t report_time = report[report_time_id]; - PTI_ASSERT(report_time.type == ZET_VALUE_TYPE_UINT64); - if (report_time.value.ui64 >= start && report_time.value.ui64 <= end) { + uint64_t host_time = GetHostTime(report, report_time_id); + if (host_time >= start && host_time <= end) { for (int j = 0; j < report_size; ++j) { target_list.push_back(report[j]); } @@ -459,17 +463,6 @@ class Profiler { return target_list; } - static size_t GetMetricId( - const std::vector& metric_list, - const std::string& metric_name) { - for (size_t i = 0; i < metric_list.size(); ++i) { - if (metric_list[i] == metric_name) { - return i; - } - } - return metric_list.size(); - } - template void ReportKernelMetrics(const KernelInterval& interval) { std::stringstream stream; @@ -489,8 +482,7 @@ class Profiler { PTI_ASSERT(report_time_id < metric_list.size()); std::vector report_list = GetMetricInterval( - ConvertTimestamp(device_interval.start), - ConvertTimestamp(device_interval.end), + device_interval.start, device_interval.end, device_interval.sub_device_id, report_time_id); uint32_t report_count = report_list.size() / report_size; PTI_ASSERT(report_count * report_size == report_list.size()); @@ -498,6 +490,7 @@ class Profiler { if (report_count > 0) { std::stringstream header; header << "SubDeviceId,"; + header << "HostTimestamp,"; for (auto& metric : metric_list) { header << metric << ","; } @@ -510,6 +503,7 @@ class Profiler { line << device_interval.sub_device_id << ","; const zet_typed_value_t* report = report_list.data() + i * report_size; + line << GetHostTime(report, report_time_id) << ","; for (int j = 0; j < report_size; ++j) { PrintTypedValue(line, report[j]); line << ","; @@ -829,8 +823,7 @@ class Profiler { PTI_ASSERT(gpu_clocks_id < metric_list.size()); std::vector report_list = GetAggregatedMetrics( - ConvertTimestamp(device_interval.start), - ConvertTimestamp(device_interval.end), + device_interval.start, device_interval.end, device_interval.sub_device_id, report_time_id, gpu_clocks_id); uint32_t report_count = report_list.size() / report_size; PTI_ASSERT(report_count * report_size == report_list.size()); @@ -838,6 +831,7 @@ class Profiler { if (report_count > 0) { std::stringstream header; header << "SubDeviceId,"; + header << "HostTimestamp,"; for (auto& metric : metric_list) { header << metric << ","; } @@ -850,6 +844,7 @@ class Profiler { line << device_interval.sub_device_id << ","; const zet_typed_value_t* report = report_list.data() + i * report_size; + line << GetHostTime(report, report_time_id) << ","; for (int j = 0; j < report_size; ++j) { PrintTypedValue(line, report[j]); line << ","; @@ -915,7 +910,6 @@ class Profiler { uint64_t host_sync_ = 0; uint64_t device_sync_ = 0; - uint64_t device_freq_ = 0; }; #endif // PTI_TOOLS_ONEPROF_PROFILER_H_ \ No newline at end of file diff --git a/tools/onetrace/README.md b/tools/onetrace/README.md index d870e53..422edaf 100644 --- a/tools/onetrace/README.md +++ b/tools/onetrace/README.md @@ -16,6 +16,7 @@ Options: --chrome-kernel-timeline Dump device activities to JSON file per kernel name --chrome-device-stages Dump device activities by stages to JSON file --verbose [-v] Enable verbose mode to show more kernel information +--demangle Demangle DPC++ kernel names --kernels-per-tile Dump kernel information per tile --tid Print thread ID into host API trace --pid Print process ID into host API and device activity trace diff --git a/tools/onetrace/tool.cc b/tools/onetrace/tool.cc index 04f3940..8ab6bd4 100644 --- a/tools/onetrace/tool.cc +++ b/tools/onetrace/tool.cc @@ -59,6 +59,10 @@ void Usage() { "--verbose [-v] " << "Enable verbose mode to show more kernel information" << std::endl; + std::cout << + "--demangle " << + "Demangle DPC++ kernel names" << + std::endl; std::cout << "--kernels-per-tile " << "Dump kernel information per tile" << @@ -128,6 +132,9 @@ int ParseArgs(int argc, char* argv[]) { strcmp(argv[i], "-v") == 0) { utils::SetEnv("ONETRACE_Verbose", "1"); ++app_index; + } else if (strcmp(argv[i], "--demangle") == 0) { + utils::SetEnv("ONETRACE_Demangle", "1"); + ++app_index; } else if (strcmp(argv[i], "--kernels-per-tile") == 0) { utils::SetEnv("ONETRACE_KernelsPerTile", "1"); ++app_index; @@ -243,6 +250,11 @@ static TraceOptions ReadArgs() { flags |= (1 << TRACE_VERBOSE); } + value = utils::GetEnv("ONETRACE_Demangle"); + if (!value.empty() && value == "1") { + flags |= (1 << TRACE_DEMANGLE); + } + value = utils::GetEnv("ONETRACE_KernelsPerTile"); if (!value.empty() && value == "1") { flags |= (1 << TRACE_KERNELS_PER_TILE); diff --git a/tools/onetrace/unified_tracer.h b/tools/onetrace/unified_tracer.h index b8bc0e4..f64233d 100644 --- a/tools/onetrace/unified_tracer.h +++ b/tools/onetrace/unified_tracer.h @@ -93,12 +93,14 @@ class UnifiedTracer { cl_callback = ClChromeStagesCallback; } - bool verbose = tracer->CheckOption(TRACE_VERBOSE); - bool kernels_per_tile = tracer->CheckOption(TRACE_KERNELS_PER_TILE); + KernelCollectorOptions kernel_options; + kernel_options.verbose = tracer->CheckOption(TRACE_VERBOSE); + kernel_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); + kernel_options.kernels_per_tile = + tracer->CheckOption(TRACE_KERNELS_PER_TILE); ze_kernel_collector = ZeKernelCollector::Create( - &tracer->correlator_, verbose, - kernels_per_tile, ze_callback, tracer); + &tracer->correlator_, kernel_options, ze_callback, tracer); if (ze_kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector for L0 backend" << @@ -108,7 +110,8 @@ class UnifiedTracer { if (cl_cpu_device != nullptr) { cl_cpu_kernel_collector = ClKernelCollector::Create( - cl_cpu_device, &tracer->correlator_, verbose, cl_callback, tracer); + cl_cpu_device, &tracer->correlator_, + kernel_options, cl_callback, tracer); if (cl_cpu_kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector for CL CPU backend" << @@ -119,7 +122,8 @@ class UnifiedTracer { if (cl_gpu_device != nullptr) { cl_gpu_kernel_collector = ClKernelCollector::Create( - cl_gpu_device, &tracer->correlator_, verbose, cl_callback, tracer); + cl_gpu_device, &tracer->correlator_, + kernel_options, cl_callback, tracer); if (cl_gpu_kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector for CL GPU backend" << @@ -151,13 +155,14 @@ class UnifiedTracer { cl_callback = ClChromeLoggingCallback; } - ApiCollectorOptions options{false, false, false}; - options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); - options.need_tid = tracer->CheckOption(TRACE_TID); - options.need_pid = tracer->CheckOption(TRACE_PID); + ApiCollectorOptions api_options; + api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); + api_options.need_tid = tracer->CheckOption(TRACE_TID); + api_options.need_pid = tracer->CheckOption(TRACE_PID); + api_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); ze_api_collector = ZeApiCollector::Create( - &tracer->correlator_, options, ze_callback, tracer); + &tracer->correlator_, api_options, ze_callback, tracer); if (ze_api_collector == nullptr) { std::cerr << "[WARNING] Unable to create L0 API collector" << std::endl; @@ -167,7 +172,7 @@ class UnifiedTracer { if (cl_cpu_device != nullptr) { cl_cpu_api_collector = ClApiCollector::Create( cl_cpu_device, &tracer->correlator_, - options, cl_callback, tracer); + api_options, cl_callback, tracer); if (cl_cpu_api_collector == nullptr) { std::cerr << "[WARNING] Unable to create CL API collector for CPU backend" << @@ -179,7 +184,7 @@ class UnifiedTracer { if (cl_gpu_device != nullptr) { cl_gpu_api_collector = ClApiCollector::Create( cl_gpu_device, &tracer->correlator_, - options, cl_callback, tracer); + api_options, cl_callback, tracer); if (cl_gpu_api_collector == nullptr) { std::cerr << "[WARNING] Unable to create CL API collector for GPU backend" << diff --git a/tools/utils/correlator.h b/tools/utils/correlator.h index 157614b..657e19d 100644 --- a/tools/utils/correlator.h +++ b/tools/utils/correlator.h @@ -19,9 +19,16 @@ #include "utils.h" struct ApiCollectorOptions { - bool call_tracing; - bool need_tid; - bool need_pid; + bool call_tracing = false; + bool need_tid = false; + bool need_pid = false; + bool demangle = false; +}; + +struct KernelCollectorOptions { + bool verbose = false; + bool demangle = false; + bool kernels_per_tile = false; }; class Correlator { diff --git a/tools/utils/trace_options.h b/tools/utils/trace_options.h index 9b733d0..1dd3ecb 100644 --- a/tools/utils/trace_options.h +++ b/tools/utils/trace_options.h @@ -23,11 +23,12 @@ #define TRACE_CHROME_KERNEL_TIMELINE 7 #define TRACE_CHROME_DEVICE_STAGES 8 #define TRACE_VERBOSE 9 -#define TRACE_KERNELS_PER_TILE 10 -#define TRACE_TID 11 -#define TRACE_PID 12 -#define TRACE_LOG_TO_FILE 13 -#define TRACE_CONDITIONAL_COLLECTION 14 +#define TRACE_DEMANGLE 10 +#define TRACE_KERNELS_PER_TILE 11 +#define TRACE_TID 12 +#define TRACE_PID 13 +#define TRACE_LOG_TO_FILE 14 +#define TRACE_CONDITIONAL_COLLECTION 15 const char* kChromeTraceFileExt = "json"; diff --git a/tools/ze_tracer/README.md b/tools/ze_tracer/README.md index ff13f8c..c70189a 100644 --- a/tools/ze_tracer/README.md +++ b/tools/ze_tracer/README.md @@ -16,6 +16,7 @@ Options: --chrome-kernel-timeline Dump device activities to JSON file per kernel name --chrome-device-stages Dump device activities by stages to JSON file --verbose [-v] Enable verbose mode to show more kernel information +--demangle Demangle DPC++ kernel names --kernels-per-tile Dump kernel information per tile --tid Print thread ID into host API trace --pid Print process ID into host API and device activity trace diff --git a/tools/ze_tracer/gen_tracing_callbacks.py b/tools/ze_tracer/gen_tracing_callbacks.py index d2c6890..f08f4b4 100644 --- a/tools/ze_tracer/gen_tracing_callbacks.py +++ b/tools/ze_tracer/gen_tracing_callbacks.py @@ -354,7 +354,9 @@ def gen_enter_callback(f, func, params, enum_map): f.write(" stream << \" " + name + " = \" << *(params->p" + name + ");\n") if name.find("Kernel") >= 0 and func == "zeCommandListAppendLaunchKernel": f.write(" if (*(params->p" + name + ") != nullptr) {\n") - f.write(" std::string kernel_name = utils::ze::GetKernelName(*(params->p" + name + "));\n") + f.write(" bool demangle = collector->options_.demangle;\n") + f.write(" std::string kernel_name =\n") + f.write(" utils::ze::GetKernelName(*(params->p" + name + "), demangle);\n") f.write(" if (!kernel_name.empty()) {\n") f.write(" stream << \" (\" << kernel_name << \")\";\n") f.write(" }\n") @@ -402,7 +404,11 @@ def gen_enter_callback(f, func, params, enum_map): f.write(" } else if (strlen((*(params->p" + name + "))->pKernelName) == 0) {\n") f.write(" stream << \" " + name + " = \\\"\\\"\";\n") f.write(" } else {\n") - f.write(" stream << (*(params->p" + name + "))->pKernelName << \"}\";\n") + f.write(" stream << \"\\\"\" << (*(params->p" + name + "))->pKernelName << \"\\\"\";\n") + f.write(" if (collector->options_.demangle) {\n") + f.write(" stream << \" (\" << utils::Demangle((*(params->p" + name + "))->pKernelName) << \")\";\n") + f.write(" }\n") + f.write(" stream << \"}\";\n") f.write(" }\n") f.write(" }\n") elif type.find("ze_device_mem_alloc_desc_t*") >= 0: diff --git a/tools/ze_tracer/tool.cc b/tools/ze_tracer/tool.cc index 7c43c5d..893d065 100644 --- a/tools/ze_tracer/tool.cc +++ b/tools/ze_tracer/tool.cc @@ -60,6 +60,10 @@ void Usage() { "--verbose [-v] " << "Enable verbose mode to show more kernel information" << std::endl; + std::cout << + "--demangle " << + "Demangle DPC++ kernel names" << + std::endl; std::cout << "--kernels-per-tile " << "Dump kernel information per tile" << @@ -129,6 +133,9 @@ int ParseArgs(int argc, char* argv[]) { strcmp(argv[i], "-v") == 0) { utils::SetEnv("ZET_Verbose", "1"); ++app_index; + } else if (strcmp(argv[i], "--demangle") == 0) { + utils::SetEnv("ZET_Demangle", "1"); + ++app_index; } else if (strcmp(argv[i], "--kernels-per-tile") == 0) { utils::SetEnv("ZET_KernelsPerTile", "1"); ++app_index; @@ -244,6 +251,11 @@ static TraceOptions ReadArgs() { flags |= (1 << TRACE_VERBOSE); } + value = utils::GetEnv("ZET_Demangle"); + if (!value.empty() && value == "1") { + flags |= (1 << TRACE_DEMANGLE); + } + value = utils::GetEnv("ZET_KernelsPerTile"); if (!value.empty() && value == "1") { flags |= (1 << TRACE_KERNELS_PER_TILE); diff --git a/tools/ze_tracer/ze_api_collector.h b/tools/ze_tracer/ze_api_collector.h index 87274f5..e6f45ab 100644 --- a/tools/ze_tracer/ze_api_collector.h +++ b/tools/ze_tracer/ze_api_collector.h @@ -51,7 +51,7 @@ class ZeApiCollector { public: // User Interface static ZeApiCollector* Create( Correlator* correlator, - ApiCollectorOptions options = {false, false, false}, + ApiCollectorOptions options, OnZeFunctionFinishCallback callback = nullptr, void* callback_data = nullptr) { PTI_ASSERT(correlator != nullptr); @@ -191,7 +191,7 @@ class ZeApiCollector { std::mutex lock_; Correlator* correlator_ = nullptr; - ApiCollectorOptions options_{false, false, false}; + ApiCollectorOptions options_; OnZeFunctionFinishCallback callback_ = nullptr; void* callback_data_ = nullptr; diff --git a/tools/ze_tracer/ze_kernel_collector.h b/tools/ze_tracer/ze_kernel_collector.h index 263f247..e8f1ba1 100644 --- a/tools/ze_tracer/ze_kernel_collector.h +++ b/tools/ze_tracer/ze_kernel_collector.h @@ -50,8 +50,9 @@ struct ZeKernelCommand { ze_device_handle_t device = nullptr; uint64_t kernel_id = 0; uint64_t append_time = 0; - uint64_t timer_frequency = 0; uint64_t call_count = 0; + uint64_t timer_frequency = 0; + uint64_t timer_mask = 0; }; struct ZeKernelCall { @@ -133,8 +134,7 @@ class ZeKernelCollector { static ZeKernelCollector* Create( Correlator* correlator, - bool verbose, - bool kernels_per_tile, + KernelCollectorOptions options, OnZeKernelFinishCallback callback = nullptr, void* callback_data = nullptr) { ze_api_version_t version = utils::ze::GetVersion(); @@ -144,7 +144,7 @@ class ZeKernelCollector { PTI_ASSERT(correlator != nullptr); ZeKernelCollector* collector = new ZeKernelCollector( - correlator, verbose, kernels_per_tile, callback, callback_data); + correlator, options, callback, callback_data); PTI_ASSERT(collector != nullptr); ze_result_t status = ZE_RESULT_SUCCESS; @@ -303,13 +303,11 @@ class ZeKernelCollector { ZeKernelCollector( Correlator* correlator, - bool verbose, - bool kernels_per_tile, + KernelCollectorOptions options, OnZeKernelFinishCallback callback, void* callback_data) : correlator_(correlator), - verbose_(verbose), - kernels_per_tile_(kernels_per_tile), + options_(options), callback_(callback), callback_data_(callback_data), kernel_id_(1) { @@ -365,7 +363,7 @@ class ZeKernelCollector { PTI_ASSERT(device != nullptr); PTI_ASSERT(correlator_ != nullptr); - utils::ze::GetTimestamps(device, &host_timestamp, &device_timestamp); + utils::ze::GetDeviceTimestamps(device, &host_timestamp, &device_timestamp); host_timestamp = correlator_->GetTimestamp(host_timestamp); device_timestamp &= utils::ze::GetDeviceTimestampMask(device); } @@ -554,22 +552,23 @@ class ZeKernelCollector { } } - uint64_t ComputeDuration(uint64_t start, uint64_t end, uint64_t freq) { + uint64_t ComputeDuration( + uint64_t start, uint64_t end, uint64_t freq, uint64_t mask) { uint64_t duration = 0; if (start < end) { duration = (end - start) * static_cast(NSEC_IN_SEC) / freq; - } else { // 32-bit timer overflow - duration = ((1ull << 32) + end - start) * + } else { // Timer Overflow + duration = ((mask + 1ull) + end - start) * static_cast(NSEC_IN_SEC) / freq; } return duration; } - void ProcessCall( + void GetHostTime( const ZeKernelCall* call, const ze_kernel_timestamp_result_t& timestamp, - int tile, bool in_summary) { + uint64_t& host_start, uint64_t& host_end) { PTI_ASSERT(call != nullptr); ZeKernelCommand* command = call->command; @@ -578,21 +577,36 @@ class ZeKernelCollector { uint64_t start = timestamp.global.kernelStart; uint64_t end = timestamp.global.kernelEnd; uint64_t freq = command->timer_frequency; + uint64_t mask = command->timer_mask; PTI_ASSERT(freq > 0); PTI_ASSERT(call->submit_time > 0); PTI_ASSERT(call->device_submit_time > 0); uint64_t time_shift = - ComputeDuration(call->device_submit_time, start, freq); - uint64_t duration = ComputeDuration(start, end, freq); + ComputeDuration(call->device_submit_time, start, freq, mask); + uint64_t duration = ComputeDuration(start, end, freq, mask); + + host_start = call->submit_time + time_shift; + host_end = host_start + duration; + } - uint64_t host_start = call->submit_time + time_shift; - uint64_t host_end = host_start + duration; + void ProcessCall( + const ZeKernelCall* call, + const ze_kernel_timestamp_result_t& timestamp, + int tile, bool in_summary) { + PTI_ASSERT(call != nullptr); + + ZeKernelCommand* command = call->command; + PTI_ASSERT(command != nullptr); + + uint64_t host_start = 0, host_end = 0; + GetHostTime(call, timestamp, host_start, host_end); + PTI_ASSERT(host_start <= host_end); std::string name = command->props.name; PTI_ASSERT(!name.empty()); - if (verbose_) { + if (options_.verbose) { name = GetVerboseName(&command->props); } @@ -640,7 +654,7 @@ class ZeKernelCollector { if (call->need_to_process) { #ifdef PTI_KERNEL_INTERVALS - AddKernelInterval(command); + AddKernelInterval(call); #else // PTI_KERNEL_INTERVALS ze_result_t status = ZE_RESULT_SUCCESS; status = zeEventQueryStatus(command->event); @@ -650,7 +664,7 @@ class ZeKernelCollector { status = zeEventQueryKernelTimestamp(command->event, ×tamp); PTI_ASSERT(status == ZE_RESULT_SUCCESS); - if (kernels_per_tile_ && command->props.simd_width > 0) { + if (options_.kernels_per_tile && command->props.simd_width > 0) { if (device_map_.count(command->device) == 1 && !device_map_[command->device].empty()) { // Implicit Scaling uint32_t count = 0; @@ -772,13 +786,16 @@ class ZeKernelCollector { } #ifdef PTI_KERNEL_INTERVALS - void AddKernelInterval(const ZeKernelCommand* command) { + void AddKernelInterval(const ZeKernelCall* call) { + PTI_ASSERT(call != nullptr); + + const ZeKernelCommand* command = call->command; PTI_ASSERT(command != nullptr); std::string name = command->props.name; PTI_ASSERT(!name.empty()); - if (verbose_) { + if (options_.verbose) { name = GetVerboseName(&command->props); } @@ -801,18 +818,12 @@ class ZeKernelCollector { ZeKernelInterval kernel_interval{ name, command->device, std::vector()}; for (uint32_t i = 0; i < count; ++i) { - uint64_t start = timestamps[i].global.kernelStart; - uint64_t end = timestamps[i].global.kernelEnd; - uint64_t freq = command->timer_frequency; - PTI_ASSERT(freq > 0); - - uint64_t duration = ComputeDuration(start, end, freq); - uint64_t start_ns = start * - static_cast(NSEC_IN_SEC) / freq; - uint64_t end_ns = start_ns + duration; - PTI_ASSERT(start_ns < end_ns); - - kernel_interval.device_interval_list.push_back({start_ns, end_ns, i}); + uint64_t host_start = 0, host_end = 0; + GetHostTime(call, timestamps[i], host_start, host_end); + PTI_ASSERT(host_start <= host_end); + + kernel_interval.device_interval_list.push_back( + {host_start, host_end, i}); } kernel_interval_list_.push_back(kernel_interval); } else { // Explicit scaling @@ -820,16 +831,9 @@ class ZeKernelCollector { status = zeEventQueryKernelTimestamp(command->event, ×tamp); PTI_ASSERT(status == ZE_RESULT_SUCCESS); - uint64_t start = timestamp.global.kernelStart; - uint64_t end = timestamp.global.kernelEnd; - uint64_t freq = command->timer_frequency; - PTI_ASSERT(freq > 0); - - uint64_t duration = ComputeDuration(start, end, freq); - uint64_t start_ns = start * - static_cast(NSEC_IN_SEC) / freq; - uint64_t end_ns = start_ns + duration; - PTI_ASSERT(start_ns < end_ns); + uint64_t host_start = 0, host_end = 0; + GetHostTime(call, timestamp, host_start, host_end); + PTI_ASSERT(host_start <= host_end); if (device_map_.count(command->device) == 0) { // Subdevice ze_device_handle_t device = GetDeviceForSubDevice(command->device); @@ -840,13 +844,14 @@ class ZeKernelCollector { ZeKernelInterval kernel_interval{ name, device, std::vector()}; kernel_interval.device_interval_list.push_back( - {start_ns, end_ns, static_cast(sub_device_id)}); + {host_start, host_end, static_cast(sub_device_id)}); kernel_interval_list_.push_back(kernel_interval); } else { // Device with no subdevices PTI_ASSERT(device_map_[command->device].empty()); ZeKernelInterval kernel_interval{ name, command->device, std::vector()}; - kernel_interval.device_interval_list.push_back({start_ns, end_ns, 0}); + kernel_interval.device_interval_list.push_back( + {host_start, host_end, 0}); kernel_interval_list_.push_back(kernel_interval); } } @@ -1159,6 +1164,8 @@ class ZeKernelCollector { command->device = device; command->timer_frequency = utils::ze::GetDeviceTimerFrequency(device); PTI_ASSERT(command->timer_frequency > 0); + command->timer_mask = utils::ze::GetDeviceTimestampMask(device); + PTI_ASSERT(command->timer_mask > 0); if (signal_event == nullptr) { ze_context_handle_t context = @@ -1229,7 +1236,8 @@ class ZeKernelCollector { ZeKernelProps props{}; - props.name = utils::ze::GetKernelName(kernel); + props.name = utils::ze::GetKernelName( + kernel, collector->options_.demangle); props.simd_width = utils::ze::GetKernelMaxSubgroupSize(kernel); props.bytes_transferred = 0; @@ -1896,8 +1904,7 @@ class ZeKernelCollector { private: // Data zel_tracer_handle_t tracer_ = nullptr; - bool verbose_ = false; - bool kernels_per_tile_ = false; + KernelCollectorOptions options_; Correlator* correlator_ = nullptr; std::atomic kernel_id_; diff --git a/tools/ze_tracer/ze_tracer.h b/tools/ze_tracer/ze_tracer.h index e7718a7..8eae4b6 100644 --- a/tools/ze_tracer/ze_tracer.h +++ b/tools/ze_tracer/ze_tracer.h @@ -67,11 +67,14 @@ class ZeTracer { callback = ChromeStagesCallback; } + KernelCollectorOptions kernel_options; + kernel_options.verbose = tracer->CheckOption(TRACE_VERBOSE); + kernel_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); + kernel_options.kernels_per_tile = + tracer->CheckOption(TRACE_KERNELS_PER_TILE); + kernel_collector = ZeKernelCollector::Create( - &(tracer->correlator_), - tracer->CheckOption(TRACE_VERBOSE), - tracer->CheckOption(TRACE_KERNELS_PER_TILE), - callback, tracer); + &(tracer->correlator_), kernel_options, callback, tracer); if (kernel_collector == nullptr) { std::cerr << "[WARNING] Unable to create kernel collector" << std::endl; @@ -91,13 +94,14 @@ class ZeTracer { callback = ChromeLoggingCallback; } - ApiCollectorOptions options{false, false, false}; - options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); - options.need_tid = tracer->CheckOption(TRACE_TID); - options.need_pid = tracer->CheckOption(TRACE_PID); + ApiCollectorOptions api_options; + api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING); + api_options.need_tid = tracer->CheckOption(TRACE_TID); + api_options.need_pid = tracer->CheckOption(TRACE_PID); + api_options.demangle = tracer->CheckOption(TRACE_DEMANGLE); api_collector = ZeApiCollector::Create( - &(tracer->correlator_), options, callback, tracer); + &(tracer->correlator_), api_options, callback, tracer); if (api_collector == nullptr) { std::cerr << "[WARNING] Unable to create API collector" << std::endl; delete tracer; diff --git a/utils/cl_utils.h b/utils/cl_utils.h index 6806e95..f88a746 100644 --- a/utils/cl_utils.h +++ b/utils/cl_utils.h @@ -119,17 +119,20 @@ inline cl_device_id GetDeviceParent(cl_device_id device) { return parent; } -inline std::string GetKernelName(cl_kernel kernel) { +inline std::string GetKernelName(cl_kernel kernel, bool demangle = false) { PTI_ASSERT(kernel != nullptr); char name[MAX_STR_SIZE] = { 0 }; cl_int status = CL_SUCCESS; - status = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, - MAX_STR_SIZE, name, nullptr); + status = clGetKernelInfo( + kernel, CL_KERNEL_FUNCTION_NAME, MAX_STR_SIZE, name, nullptr); PTI_ASSERT(status == CL_SUCCESS); - return demangle(name); + if (demangle) { + return utils::Demangle(name); + } + return name; } inline std::string GetDeviceName(cl_device_id device) { @@ -504,4 +507,4 @@ inline const char* GetErrorString(cl_int error) { } // namespace cl } // namespace utils -#endif // PTI_UTILS_CL_UTILS_H_ +#endif // PTI_UTILS_CL_UTILS_H_ \ No newline at end of file diff --git a/utils/demangle.h b/utils/demangle.h index 909695c..b46e338 100644 --- a/utils/demangle.h +++ b/utils/demangle.h @@ -10,27 +10,29 @@ #endif #include +#include "pti_assert.h" + namespace utils { -static inline std::string demangle(const char* name) { - if (!name) - { - return std::string{}; - } +static inline std::string Demangle(const char* name) { + PTI_ASSERT(name != nullptr); + #if HAVE_CXXABI - int status; - char *demangled = abi::__cxa_demangle(name, nullptr, 0, &status); + int status = 0; + char* demangled = abi::__cxa_demangle(name, nullptr, 0, &status); if (status != 0) { return name; } - - constexpr const char *const prefixToSkip = "typeinfo name for "; - const size_t prefixToSkipLen = strlen(prefixToSkip); - const size_t shift = (std::strncmp(demangled, prefixToSkip, prefixToSkipLen) == 0) ? prefixToSkipLen : 0; - std::string retVal(demangled + shift); + constexpr const char* const prefix_to_skip = "typeinfo name for "; + const size_t prefix_to_skip_len = strlen(prefix_to_skip); + const size_t shift = + (std::strncmp(demangled, prefix_to_skip, prefix_to_skip_len) == 0) ? + prefix_to_skip_len : 0; + + std::string result(demangled + shift); free(demangled); - return retVal; + return result; #else return name; #endif @@ -40,4 +42,4 @@ static inline std::string demangle(const char* name) { #undef HAVE_CXXABI -#endif // PTI_UTILS_DEMANGLE_H_ +#endif // PTI_UTILS_DEMANGLE_H_ \ No newline at end of file diff --git a/utils/ze_utils.h b/utils/ze_utils.h index 46af0f5..1b7926b 100644 --- a/utils/ze_utils.h +++ b/utils/ze_utils.h @@ -290,7 +290,8 @@ inline size_t GetKernelMaxSubgroupSize(ze_kernel_handle_t kernel) { return props.maxSubgroupSize; } -inline std::string GetKernelName(ze_kernel_handle_t kernel) { +inline std::string GetKernelName( + ze_kernel_handle_t kernel, bool demangle = false) { PTI_ASSERT(kernel != nullptr); size_t size = 0; @@ -301,12 +302,15 @@ inline std::string GetKernelName(ze_kernel_handle_t kernel) { std::vector name(size); status = zeKernelGetName(kernel, &size, name.data()); PTI_ASSERT(status == ZE_RESULT_SUCCESS); - PTI_ASSERT(name[size - 1] == '\0'); - return demangle(name.data()); + + if (demangle) { + return utils::Demangle(name.data()); + } + return std::string(name.begin(), name.end() - 1); } -inline void GetTimestamps( +inline void GetDeviceTimestamps( ze_device_handle_t device, uint64_t* host_timestamp, uint64_t* device_timestamp) { @@ -318,6 +322,19 @@ inline void GetTimestamps( PTI_ASSERT(status == ZE_RESULT_SUCCESS); } +inline void GetMetricTimestamps( + ze_device_handle_t device, + uint64_t* host_timestamp, + uint64_t* metric_timestamp) { + PTI_ASSERT(device != nullptr); + PTI_ASSERT(host_timestamp != nullptr); + PTI_ASSERT(metric_timestamp != nullptr); + // TODO: replace with zeMetricGetGlobalTimestamps + ze_result_t status = zeDeviceGetGlobalTimestamps( + device, host_timestamp, metric_timestamp); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); +} + inline uint64_t GetDeviceTimerFrequency(ze_device_handle_t device) { PTI_ASSERT(device != nullptr); ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; @@ -334,6 +351,14 @@ uint64_t GetDeviceTimestampMask(ze_device_handle_t device) { return (1ull << props.kernelTimestampValidBits) - 1ull; } +uint64_t GetMetricTimestampMask(ze_device_handle_t device) { + PTI_ASSERT(device != nullptr); + ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, }; + ze_result_t status = zeDeviceGetProperties(device, &props); + PTI_ASSERT(status == ZE_RESULT_SUCCESS); + return (1ull << props.kernelTimestampValidBits) - 1ull; +} + inline ze_api_version_t GetDriverVersion(ze_driver_handle_t driver) { PTI_ASSERT(driver != nullptr); @@ -355,4 +380,4 @@ inline ze_api_version_t GetVersion() { } // namespace ze } // namespace utils -#endif // PTI_UTILS_ZE_UTILS_H_ +#endif // PTI_UTILS_ZE_UTILS_H_ \ No newline at end of file