Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d84635b

Browse filesBrowse files
authored
opencl: improve profiling (ggml-org#12442)
* opencl: more profiling timing * opencl: generate trace for profiling * opencl: reduce profiling overhead * Populate profiling timing info at the end rather than after each kernel run * opencl: fix for chrome tracing
1 parent 75422e8 commit d84635b
Copy full SHA for d84635b

File tree

Expand file treeCollapse file tree

1 file changed

+95
-22
lines changed
Filter options
Expand file treeCollapse file tree

1 file changed

+95
-22
lines changed

‎ggml/src/ggml-opencl/ggml-opencl.cpp

Copy file name to clipboardExpand all lines: ggml/src/ggml-opencl/ggml-opencl.cpp
+95-22Lines changed: 95 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
297297
struct ProfilingInfo {
298298
std::string op_name;
299299
std::string kernel_name;
300-
// Kernel execution time in nanoseconds.
301-
cl_ulong duration_ns;
300+
301+
cl_kernel kernel;
302+
cl_event evt;
303+
304+
cl_ulong cmd_queued;
305+
cl_ulong cmd_submit;
306+
cl_ulong cmd_start;
307+
cl_ulong cmd_end;
308+
cl_ulong overhead_start;
309+
cl_ulong overhead_end;
310+
// For the times below, see spec for clGetEventProfilingInfo
311+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
312+
cl_ulong cmd_queued_duration_ns;
313+
// The time kernel spent for submission - START - SUBMIT
314+
cl_ulong cmd_submit_duration_ns;
315+
// Kernel execution time in nanoseconds - END - START
316+
cl_ulong cmd_duration_ns;
317+
// The time for the kernel to complete - COMPLETE - END
318+
cl_ulong cmd_complete_duration_ns;
319+
// Total time to finish the kernel - COMPELTE - QUEUED
320+
cl_ulong cmd_total_duration_ns;
302321
// Global and local work sizes.
303322
size_t global_size[3];
304323
size_t local_size[3];
@@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
903922
return;
904923
}
905924

925+
// Populate profiling info
926+
for (ProfilingInfo & info : g_profiling_info) {
927+
cl_ulong cmd_queued;
928+
cl_ulong cmd_submit;
929+
cl_ulong cmd_start;
930+
cl_ulong cmd_end;
931+
cl_ulong cmd_complete;
932+
933+
CL_CHECK(clWaitForEvents(1, &info.evt));
934+
CL_CHECK(clGetEventProfilingInfo(
935+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
936+
CL_CHECK(clGetEventProfilingInfo(
937+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
938+
CL_CHECK(clGetEventProfilingInfo(
939+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
940+
CL_CHECK(clGetEventProfilingInfo(
941+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
942+
CL_CHECK(clGetEventProfilingInfo(
943+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
944+
CL_CHECK(clReleaseEvent(info.evt));
945+
946+
char kernel_name[512];
947+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
948+
sizeof(kernel_name), kernel_name, NULL));
949+
info.kernel_name = kernel_name;
950+
951+
info.cmd_queued = cmd_queued;
952+
info.cmd_submit = cmd_submit;
953+
info.cmd_start = cmd_start;
954+
info.cmd_end = cmd_end;
955+
956+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958+
info.cmd_duration_ns = cmd_end - cmd_start;
959+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961+
}
962+
963+
// Dump a csv
906964
float total_kernel_time = 0;
907-
fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
965+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
908966
for (const ProfilingInfo & info : g_profiling_info) {
909-
total_kernel_time += info.duration_ns/1.e6f;
910-
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
911-
info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
967+
total_kernel_time += info.cmd_duration_ns/1.e6f;
968+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
969+
info.op_name.c_str(), info.kernel_name.c_str(),
970+
info.cmd_queued_duration_ns/1.e6f,
971+
info.cmd_submit_duration_ns/1.e6f,
972+
info.cmd_duration_ns/1.e6f,
973+
info.cmd_complete_duration_ns/1.e6f,
974+
info.cmd_total_duration_ns/1.e6f,
912975
info.global_size[0], info.global_size[1], info.global_size[2],
913976
info.local_size[0], info.local_size[2], info.local_size[2],
914977
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
915978
}
916979
fclose(fperf);
917980

918981
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
982+
983+
// Dump a simple chrome trace
984+
FILE* ftrace = fopen("cl_trace.json", "w");
985+
if (!ftrace) {
986+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
987+
return;
988+
}
989+
990+
fprintf(ftrace, "[\n");
991+
for (const ProfilingInfo & info : g_profiling_info) {
992+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
993+
info.kernel_name.c_str(), info.cmd_queued/1000);
994+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
995+
info.kernel_name.c_str(), info.cmd_submit/1000);
996+
997+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
998+
info.kernel_name.c_str(), info.cmd_start/1000);
999+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
1000+
info.kernel_name.c_str(), info.cmd_end/1000);
1001+
}
1002+
fclose(ftrace);
9191003
#endif
9201004
}
9211005

@@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622146
// Profiling utility
20632147
//------------------------------------------------------------------------------
20642148
#ifdef GGML_OPENCL_PROFILING
2065-
void populateProfilingInfo(
2149+
static void populateProfilingInfo(
20662150
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672151
size_t global_size[3], size_t local_size[3],
20682152
const ggml_tensor * tensor) {
2069-
cl_ulong start;
2070-
cl_ulong end;
2071-
CL_CHECK(clWaitForEvents(1, &evt));
2072-
CL_CHECK(clGetEventProfilingInfo(
2073-
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
2074-
CL_CHECK(clGetEventProfilingInfo(
2075-
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
2076-
2077-
char kernel_name[512];
2078-
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
2079-
sizeof(kernel_name), kernel_name, NULL));
2080-
2081-
info.duration_ns = end - start;
2082-
info.op_name = tensor->name;
2083-
info.kernel_name = kernel_name;
2153+
info.op_name = tensor->name;
2154+
info.kernel = kernel;
2155+
info.evt = evt;
2156+
20842157
info.local_size[0] = local_size[0];
20852158
info.local_size[1] = local_size[1];
20862159
info.local_size[2] = local_size[2];

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.