Skip to content

Commit

Permalink
Redesign time correlation in oneprof, improve DPC++ kernel demangling
Browse files Browse the repository at this point in the history
  • Loading branch information
anton-v-gorshkov committed Jan 11, 2022
1 parent b461461 commit e16ec20
Show file tree
Hide file tree
Showing 29 changed files with 352 additions and 246 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.42.1
0.42.4
1 change: 1 addition & 0 deletions build_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ macro(FindIGALibrary TARGET)
"IGA is not found. "
"You may need to install Intel(R) Processor Graphics Driver to fix this issue.")
else()
list(GET IGA_DLL_PATH 0 IGA_DLL_PATH)
message(STATUS
"IGA is found at ${IGA_DLL_PATH}")
find_library(IGA_LIB_PATH
Expand Down
6 changes: 2 additions & 4 deletions loader/loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,10 @@ static bool CheckBitness(HANDLE parent, HANDLE child) {
int main(int argc, char* argv[]) {
std::string library_file_name = GetLibFileName();
std::string executable_path = utils::GetExecutablePath();
std::string library_file_path = executable_path + library_file_name;

std::string library_file_path = executable_path + library_file_name;
if (!IsFileExists(library_file_path.c_str())) {
std::cout << "[ERROR] Failed to find " <<
library_file_name << " near the loader" << std::endl;
return 0;
library_file_path = library_file_name;
}

SharedLibrary* lib = SharedLibrary::Create(library_file_path);
Expand Down
3 changes: 3 additions & 0 deletions tests/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
["sysmon", "-p", "-l", "-d"],
["onetrace",
"-c", "-h", "-d", "-v", "-t", "-s",
"--demangle",
"--kernels-per-tile",
"--chrome-call-logging",
"--chrome-device-timeline",
Expand All @@ -44,6 +45,7 @@
"cl", "ze", "omp"],
["cl_tracer",
"-c", "-h", "-d", "-v", "-t", "-s",
"--demangle",
"--chrome-call-logging",
"--chrome-device-timeline",
"--chrome-kernel-timeline",
Expand All @@ -52,6 +54,7 @@
"gpu", "dpc", "omp"],
["ze_tracer",
"-c", "-h", "-d", "-v", "-t", "-s",
"--demangle",
"--kernels-per-tile",
"--chrome-call-logging",
"--chrome-device-timeline",
Expand Down
4 changes: 3 additions & 1 deletion tests/tools/cl_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def run(path, option):
if option == "gpu":
command = [file_name_prefix + "cl_tracer" + file_extention,\
"-h", "-d", "-t", app_file, "gpu", "1024", "1"]
elif option == "-v" or option == "--conditional-collection":
elif option == "-v" or option == "--demangle" or option == "--conditional-collection":
command = [file_name_prefix + "cl_tracer" + file_extention,\
"-d", option, app_file, "cpu", "1024", "1"]
else:
Expand Down Expand Up @@ -102,6 +102,8 @@ def main(option):
option = "-s"
if len(sys.argv) > 1 and sys.argv[1] == "-v":
option = "-v"
if len(sys.argv) > 1 and sys.argv[1] == "--demangle":
option = "--demangle"
if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging":
option = "--chrome-call-logging"
if len(sys.argv) > 1 and sys.argv[1] == "--chrome-device-timeline":
Expand Down
5 changes: 4 additions & 1 deletion tests/tools/onetrace.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def run(path, option):
app_folder = utils.get_sample_executable_path("omp_gemm")
app_file = os.path.join(app_folder, "omp_gemm")
command = ["./onetrace", "-h", "-d", "-t", app_file, "gpu", "1024", "1"]
elif option == "-v" or option == "--kernels-per-tile" or option == "--conditional-collection":
elif option == "-v" or option == "--demangle" or\
option == "--kernels-per-tile" or option == "--conditional-collection":
app_folder = utils.get_sample_executable_path("dpc_gemm")
app_file = os.path.join(app_folder, "dpc_gemm")
command = ["./onetrace", "-d", option, app_file, "gpu", "1024", "1"]
Expand Down Expand Up @@ -88,6 +89,8 @@ def main(option):
option = "-s"
if len(sys.argv) > 1 and sys.argv[1] == "-v":
option = "-v"
if len(sys.argv) > 1 and sys.argv[1] == "--demangle":
option = "--demangle"
if len(sys.argv) > 1 and sys.argv[1] == "--kernels-per-tile":
option = "--kernels-per-tile"
if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging":
Expand Down
5 changes: 4 additions & 1 deletion tests/tools/ze_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def run(path, option):
app_folder = utils.get_sample_executable_path("omp_gemm")
app_file = os.path.join(app_folder, "omp_gemm")
command = ["./ze_tracer", "-h", "-d", "-t", app_file, "gpu", "1024", "1"]
elif option == "-v" or option == "--kernels-per-tile" or option == "--conditional-collection":
elif option == "-v" or option == "--demangle" or\
option == "--kernels-per-tile" or option == "--conditional-collection":
app_folder = utils.get_sample_executable_path("ze_gemm")
app_file = os.path.join(app_folder, "ze_gemm")
command = ["./ze_tracer", "-d", option, app_file, "1024", "1"]
Expand Down Expand Up @@ -81,6 +82,8 @@ def main(option):
option = "-s"
if len(sys.argv) > 1 and sys.argv[1] == "-v":
option = "-v"
if len(sys.argv) > 1 and sys.argv[1] == "--demangle":
option = "--demangle"
if len(sys.argv) > 1 and sys.argv[1] == "--kernels-per-tile":
option = "--kernels-per-tile"
if len(sys.argv) > 1 and sys.argv[1] == "--chrome-call-logging":
Expand Down
1 change: 1 addition & 0 deletions tools/cl_tracer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Options:
--chrome-kernel-timeline Dump device activities to JSON file per kernel name
--chrome-device-stages Dump device activities by stages to JSON file
--verbose [-v] Enable verbose mode to show more kernel information
--demangle Demangle DPC++ kernel names
--tid Print thread ID into host API trace
--pid Print process ID into host API and device activity trace
--output [-o] <filename> Print console logs into the file
Expand Down
9 changes: 7 additions & 2 deletions tools/cl_tracer/cl_api_callbacks.h
Original file line number Diff line number Diff line change
Expand Up @@ -2931,7 +2931,8 @@ static void clEnqueueTaskOnEnter(
stream << " commandQueue = " << *(params->commandQueue);
stream << " kernel = " << *(params->kernel);
if (*(params->kernel) != nullptr) {
std::string kernel_name = utils::cl::GetKernelName(*(params->kernel));
std::string kernel_name = utils::cl::GetKernelName(
*(params->kernel), collector->Demangle());
if (!kernel_name.empty()) {
stream << " (" << kernel_name << ")";
}
Expand Down Expand Up @@ -3582,7 +3583,8 @@ static void clEnqueueNDRangeKernelOnEnter(
stream << " commandQueue = " << *(params->commandQueue);
stream << " kernel = " << *(params->kernel);
if (*(params->kernel) != nullptr) {
std::string kernel_name = utils::cl::GetKernelName(*(params->kernel));
std::string kernel_name = utils::cl::GetKernelName(
*(params->kernel), collector->Demangle());
if (!kernel_name.empty()) {
stream << " (" << kernel_name << ")";
}
Expand Down Expand Up @@ -5966,6 +5968,9 @@ static void clCreateKernelOnEnter(
stream << " kernelName = \"\"";
} else {
stream << " kernelName = \"" << *(params->kernelName) << "\"";
if (collector->Demangle()) {
stream << " (" << utils::Demangle(*(params->kernelName)) << ")";
}
}
stream << " errcodeRet = " << *(params->errcodeRet);
stream << std::endl;
Expand Down
8 changes: 6 additions & 2 deletions tools/cl_tracer/cl_api_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class ClApiCollector {
static ClApiCollector* Create(
cl_device_id device,
Correlator* correlator,
ApiCollectorOptions options = {false, false, false},
ApiCollectorOptions options,
OnClFunctionFinishCallback callback = nullptr,
void* callback_data = nullptr) {
PTI_ASSERT(device != nullptr);
Expand Down Expand Up @@ -115,6 +115,10 @@ class ClApiCollector {
return options_.need_pid;
}

bool Demangle() const {
return options_.demangle;
}

void Log(const std::string& text) {
PTI_ASSERT(correlator_ != nullptr);
correlator_->Log(text);
Expand Down Expand Up @@ -332,7 +336,7 @@ class ClApiCollector {
ClApiTracer* tracer_ = nullptr;

Correlator* correlator_ = nullptr;
ApiCollectorOptions options_ = {false, false, false};
ApiCollectorOptions options_;
cl_device_type device_type_ = CL_DEVICE_TYPE_ALL;

OnClFunctionFinishCallback callback_ = nullptr;
Expand Down
42 changes: 17 additions & 25 deletions tools/cl_tracer/cl_kernel_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ class ClKernelCollector {
static ClKernelCollector* Create(
cl_device_id device,
Correlator* correlator,
bool verbose,
KernelCollectorOptions options,
OnClKernelFinishCallback callback = nullptr,
void* callback_data = nullptr) {
PTI_ASSERT(device != nullptr);
PTI_ASSERT(correlator != nullptr);
TraceGuard guard;

ClKernelCollector* collector = new ClKernelCollector(
device, correlator, verbose, callback, callback_data);
device, correlator, options, callback, callback_data);
PTI_ASSERT(collector != nullptr);

ClApiTracer* tracer = new ClApiTracer(device, Callback, collector);
Expand Down Expand Up @@ -281,12 +281,12 @@ class ClKernelCollector {
ClKernelCollector(
cl_device_id device,
Correlator* correlator,
bool verbose,
KernelCollectorOptions options,
OnClKernelFinishCallback callback,
void* callback_data)
: device_(device),
correlator_(correlator),
verbose_(verbose),
options_(options),
callback_(callback),
callback_data_(callback_data),
kernel_id_(1) {
Expand Down Expand Up @@ -443,6 +443,13 @@ class ClKernelCollector {
PTI_ASSERT(device != nullptr);
AddKernelInterval(instance, device, started, ended);
#else // PTI_KERNEL_INTERVALS
std::string name = instance->props.name;
PTI_ASSERT(!name.empty());

if (options_.verbose) {
name = GetVerboseName(&(instance->props));
}

uint64_t host_queued = 0, host_submitted = 0;
uint64_t host_started = 0, host_ended = 0;
ComputeHostTimestamps(
Expand All @@ -451,19 +458,12 @@ class ClKernelCollector {
host_queued, host_submitted,
host_started, host_ended);
AddKernelInfo(
&instance->props,
name,
host_submitted - host_queued,
host_started - host_submitted,
host_ended - host_started);

if (callback_ != nullptr) {
std::string name = instance->props.name;
PTI_ASSERT(!name.empty());

if (verbose_) {
name = GetVerboseName(&(instance->props));
}

std::stringstream stream;
stream << std::hex << queue;

Expand Down Expand Up @@ -551,17 +551,10 @@ class ClKernelCollector {
}

void AddKernelInfo(
const ClKernelProps* props, uint64_t queued_time,
std::string name, uint64_t queued_time,
uint64_t submit_time, uint64_t execute_time) {
PTI_ASSERT(props != nullptr);

std::string name = props->name;
PTI_ASSERT(!name.empty());

if (verbose_) {
name = GetVerboseName(props);
}

if (kernel_info_map_.count(name) == 0) {
ClKernelInfo info;
info.queued_time = queued_time;
Expand Down Expand Up @@ -606,7 +599,7 @@ class ClKernelCollector {
std::string name = instance->props.name;
PTI_ASSERT(!name.empty());

if (verbose_) {
if (options_.verbose) {
name = GetVerboseName(&instance->props);
}

Expand Down Expand Up @@ -755,11 +748,9 @@ class ClKernelCollector {
cl_ulong host_timestamp = 0;
utils::cl::GetTimestamps(
collector->device_, &enqueue_data->host_sync, &enqueue_data->device_sync);
#ifndef PTI_KERNEL_INTERVALS
PTI_ASSERT(collector->correlator_ != nullptr);
enqueue_data->host_sync =
collector->correlator_->GetTimestamp(enqueue_data->host_sync);
#endif

const T* params = reinterpret_cast<const T*>(data->functionParams);
PTI_ASSERT(params != nullptr);
Expand Down Expand Up @@ -797,7 +788,8 @@ class ClKernelCollector {
instance->event = **(params->event);

cl_kernel kernel = *(params->kernel);
instance->props.name = utils::cl::GetKernelName(kernel);
instance->props.name = utils::cl::GetKernelName(
kernel, collector->options_.demangle);

cl_command_queue queue = *(params->commandQueue);
PTI_ASSERT(queue != nullptr);
Expand Down Expand Up @@ -1401,7 +1393,7 @@ class ClKernelCollector {
ClApiTracer* tracer_ = nullptr;
Correlator* correlator_ = nullptr;

bool verbose_ = false;
KernelCollectorOptions options_;

std::atomic<uint64_t> kernel_id_;
cl_device_id device_ = nullptr;
Expand Down
23 changes: 13 additions & 10 deletions tools/cl_tracer/cl_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,14 @@ class ClTracer {
callback = ChromeStagesCallback;
}

KernelCollectorOptions kernel_options;
kernel_options.verbose = tracer->CheckOption(TRACE_VERBOSE);
kernel_options.demangle = tracer->CheckOption(TRACE_DEMANGLE);

if (cpu_device != nullptr) {
cpu_kernel_collector = ClKernelCollector::Create(
cpu_device, &tracer->correlator_,
tracer->CheckOption(TRACE_VERBOSE),
callback, tracer);
kernel_options, callback, tracer);
if (cpu_kernel_collector == nullptr) {
std::cerr <<
"[WARNING] Unable to create kernel collector for CPU backend" <<
Expand All @@ -97,8 +100,7 @@ class ClTracer {
if (gpu_device != nullptr) {
gpu_kernel_collector = ClKernelCollector::Create(
gpu_device, &tracer->correlator_,
tracer->CheckOption(TRACE_VERBOSE),
callback, tracer);
kernel_options, callback, tracer);
if (gpu_kernel_collector == nullptr) {
std::cerr <<
"[WARNING] Unable to create kernel collector for GPU backend" <<
Expand All @@ -125,15 +127,16 @@ class ClTracer {
callback = ChromeLoggingCallback;
}

ApiCollectorOptions cl_api_options{false, false, false};
cl_api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING);
cl_api_options.need_tid = tracer->CheckOption(TRACE_TID);
cl_api_options.need_pid = tracer->CheckOption(TRACE_PID);
ApiCollectorOptions api_options;
api_options.call_tracing = tracer->CheckOption(TRACE_CALL_LOGGING);
api_options.need_tid = tracer->CheckOption(TRACE_TID);
api_options.need_pid = tracer->CheckOption(TRACE_PID);
api_options.demangle = tracer->CheckOption(TRACE_DEMANGLE);

if (cpu_device != nullptr) {
cpu_api_collector = ClApiCollector::Create(
cpu_device, &tracer->correlator_,
cl_api_options, callback, tracer);
api_options, callback, tracer);
if (cpu_api_collector == nullptr) {
std::cerr <<
"[WARNING] Unable to create API collector for CPU backend" <<
Expand All @@ -145,7 +148,7 @@ class ClTracer {
if (gpu_device != nullptr) {
gpu_api_collector = ClApiCollector::Create(
gpu_device, &tracer->correlator_,
cl_api_options, callback, tracer);
api_options, callback, tracer);
if (gpu_api_collector == nullptr) {
std::cerr <<
"[WARNING] Unable to create API collector for GPU backend" <<
Expand Down
12 changes: 12 additions & 0 deletions tools/cl_tracer/tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ void Usage() {
"--verbose [-v] " <<
"Enable verbose mode to show more kernel information" <<
std::endl;
std::cout <<
"--demangle " <<
"Demangle DPC++ kernel names" <<
std::endl;
std::cout <<
"--tid " <<
"Print thread ID into host API trace" <<
Expand Down Expand Up @@ -125,6 +129,9 @@ int ParseArgs(int argc, char* argv[]) {
strcmp(argv[i], "-v") == 0) {
utils::SetEnv("CLT_Verbose", "1");
++app_index;
} else if (strcmp(argv[i], "--demangle") == 0) {
utils::SetEnv("CLT_Demangle", "1");
++app_index;
} else if (strcmp(argv[i], "--tid") == 0) {
utils::SetEnv("CLT_Tid", "1");
++app_index;
Expand Down Expand Up @@ -235,6 +242,11 @@ static TraceOptions ReadArgs() {
flags |= (1 << TRACE_VERBOSE);
}

value = utils::GetEnv("CLT_Demangle");
if (!value.empty() && value == "1") {
flags |= (1 << TRACE_DEMANGLE);
}

value = utils::GetEnv("CLT_Tid");
if (!value.empty() && value == "1") {
flags |= (1 << TRACE_TID);
Expand Down
Loading

0 comments on commit e16ec20

Please sign in to comment.