Skip to content

Instantly share code, notes, and snippets.

@stas00
Created June 24, 2021 00:18
Show Gist options
  • Save stas00/21909c2e4866b709c36f3ea02fb99432 to your computer and use it in GitHub Desktop.
Save stas00/21909c2e4866b709c36f3ea02fb99432 to your computer and use it in GitHub Desktop.
build_table: put name column last, make cols more narrow
# torch/autograd/profiler.py
def build_table(
        events,
        sort_by=None,
        header=None,
        row_limit=100,
        max_src_column_width=75,
        with_flops=False,
        profile_memory=False,
        top_level_events_only=False):
    """Build a text summary table of events (a list of FunctionEvent or FunctionEventAvg).

    The ``Name`` column is emitted last so that long names sit at the end of
    each line; all numeric columns share a narrow fixed width.

    Args:
        events: sequence of event objects to summarize.
        sort_by: optional attribute name; when given, events are sorted by
            that attribute in descending order.
        header: optional text printed above the table under a ``=`` rule.
        row_limit: maximum number of event rows to emit.
        max_src_column_width: upper bound for the ``Source Location`` column.
        with_flops: add an auto-scaled FLOPS-rate column. The column is
            dropped when no event has a positive rate.
        profile_memory: add the CPU memory columns (and the CUDA memory
            columns when any event reports ``self_cuda_memory_usage > 0``).
        top_level_events_only: skip events whose ``cpu_parent`` is set; the
            skipped events do not count towards ``row_limit``.

    Returns:
        The table as one string in which every line, including the last, is
        terminated by a newline. An empty string when ``events`` is empty.
    """
    if not events:
        return ""

    has_cuda_time = any(event.self_cuda_time_total > 0 for event in events)
    has_cuda_mem = any(event.self_cuda_memory_usage > 0 for event in events)
    has_input_shapes = any(
        event.input_shapes is not None and len(event.input_shapes) > 0
        for event in events)

    if sort_by is not None:
        events = EventList(sorted(
            events, key=lambda evt: getattr(evt, sort_by), reverse=True
        ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops)

    MAX_NAME_COLUMN_WIDTH = 55
    name_column_width = min(
        max(len(evt.key) for evt in events) + 4, MAX_NAME_COLUMN_WIDTH)

    MAX_SHAPES_COLUMN_WIDTH = 80
    shapes_column_width = min(
        max(len(str(evt.input_shapes)) for evt in events) + 4,
        MAX_SHAPES_COLUMN_WIDTH)

    DEFAULT_COLUMN_WIDTH = 10
    flops_column_width = DEFAULT_COLUMN_WIDTH

    # Only non-empty stacks take part in sizing the source column.
    stacks = [evt.stack for evt in events if evt.stack]
    has_stack = len(stacks) > 0
    src_column_width = None
    if has_stack:
        src_column_width = min(
            max(len(entry) for stack in stacks for entry in stack) + 4,
            max_src_column_width)

    headers = [
        'Self CPU %',
        'Self CPU',
        'CPU total %',
        'CPU total',
        'CPU time avg',
    ]
    if has_cuda_time:
        headers.extend([
            'Self CUDA',
            'Self CUDA %',
            'CUDA total',
            'CUDA time avg',
        ])
    if profile_memory:
        headers.extend([
            'CPU Mem',
            'Self CPU Mem',
        ])
        if has_cuda_mem:
            headers.extend([
                'CUDA Mem',
                'Self CUDA Mem',
            ])
    headers.append('# of Calls')
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any(evt.node_id != -1 for evt in events)
    if append_node_id:
        headers.append('Node ID')

    SPACING_SIZE = 2
    MAX_STACK_ENTRY = 5

    def auto_scale_flops(flops):
        """Return ``(scale, unit_header)`` that brings ``flops`` into a readable range."""
        flop_headers = [
            'FLOPS',
            'KFLOPS',
            'MFLOPS',
            'GFLOPS',
            'TFLOPS',
            'PFLOPS',
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])

    def flops_rate(evt):
        """Return flops per second for ``evt``, or -1 when no rate can be computed."""
        US_IN_SECOND = 1000.0 * 1000.0
        if evt.flops > 0:
            # Prefer the CUDA time; fall back to the CPU time when it is zero.
            elapsed = evt.cuda_time_total if evt.cuda_time_total != 0 else evt.cpu_time_total
            if elapsed != 0:
                return float(evt.flops) / elapsed * US_IN_SECOND
        # No flops recorded, or no elapsed time to divide by.
        return -1

    # One (width, alignment) pair per column, kept parallel to `headers`.
    column_specs = [(DEFAULT_COLUMN_WIDTH, '>') for _ in headers]

    if has_input_shapes:
        headers.append('Input Shapes')
        column_specs.append((shapes_column_width, '>'))

    # Remember where the source column sits: it is no longer the last column,
    # and the stack continuation rows below have to target it explicitly.
    src_column_index = None
    if has_stack:
        src_column_index = len(headers)
        headers.append('Source Location')
        column_specs.append((src_column_width, '<'))

    if with_flops:
        # Auto-scaling of flops header
        raw_flops = [rate for rate in (flops_rate(evt) for evt in events) if rate > 0]
        if raw_flops:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append(flops_header)
            column_specs.append((flops_column_width, '>'))
        else:
            with_flops = False  # can't find any valid flops

    # name last so that it could span long
    headers.append('Name')
    column_specs.append((name_column_width, '<'))

    spacing = ' ' * SPACING_SIZE
    row_format = ''.join(
        '{: ' + text_dir + str(width) + '}' + spacing
        for width, text_dir in column_specs)
    header_sep = ''.join('-' * width + spacing for width, _ in column_specs)
    # The trailing spacing after the last column is not part of the rule length.
    line_length = sum(width + SPACING_SIZE for width, _ in column_specs) - SPACING_SIZE

    result = []

    def append(s):
        result.append(s)
        result.append('\n')  # Yes, newline after the end as well

    sum_self_cpu_time_total = sum(event.self_cpu_time_total for event in events)
    sum_self_cuda_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in kineto profiler, there're events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total

    # Actual printing
    if header is not None:
        append('=' * line_length)
        append(header)
    if top_level_events_only:
        append('=' * line_length)
        append('This report only display top-level ops statistics')

    append(header_sep)
    append(row_format.format(*headers))
    append(header_sep)

    def trim_path(path, src_column_width):
        """Keep the tail of ``path`` that fits the column, marking the cut with '...'."""
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        event_limit += 1

        name = evt.key
        if len(name) >= MAX_NAME_COLUMN_WIDTH - 3:
            name = name[:(MAX_NAME_COLUMN_WIDTH - 3)] + "..."

        row_values = [
            # Self CPU total %, 0 for async events.
            format_time_share(evt.self_cpu_time_total,
                              sum_self_cpu_time_total),
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_cuda_time:
            row_values.extend([
                evt.self_cuda_time_total_str,
                # CUDA time total %
                format_time_share(evt.self_cuda_time_total, sum_self_cuda_time_total),
                evt.cuda_time_total_str,
                evt.cuda_time_str,  # Cuda time avg
            ])
        if profile_memory:
            row_values.extend([
                # CPU Mem Total
                format_memory(evt.cpu_memory_usage),
                # Self CPU Mem Total
                format_memory(evt.self_cpu_memory_usage),
            ])
            if has_cuda_mem:
                row_values.extend([
                    # CUDA Mem Total
                    format_memory(evt.cuda_memory_usage),
                    # Self CUDA Mem Total
                    format_memory(evt.self_cuda_memory_usage),
                ])
        row_values.append(evt.count)  # Number of calls
        if append_node_id:
            row_values.append(evt.node_id)

        # The optional columns must be appended in the same order as their
        # headers: Input Shapes, Source Location, FLOPS, Name.
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])

        # Individual events may carry no stack even when others do.
        stack = evt.stack or []
        if has_stack:
            src_field = ""
            if len(stack) > 0:
                src_field = trim_path(stack[0], src_column_width)
            row_values.append(src_field)

        if with_flops:
            rate = flops_rate(evt)
            if rate <= 0.0:
                row_values.append("--")
            else:
                row_values.append('{0:8.3f}'.format(rate * flops_scale))

        # name last
        row_values.append(name)
        append(row_format.format(*row_values))

        if has_stack:
            # Remaining stack entries go on continuation rows, in the
            # Source Location column, followed by one blank separator row.
            blank_row = [""] * len(headers)
            for entry in stack[1:MAX_STACK_ENTRY]:
                continuation = list(blank_row)
                continuation[src_column_index] = trim_path(entry, src_column_width)
                append(row_format.format(*continuation))
            append(row_format.format(*blank_row))

    append(header_sep)
    append("Self CPU time total: {}".format(format_time(sum_self_cpu_time_total)))
    if has_cuda_time:
        append("Self CUDA time total: {}".format(format_time(sum_self_cuda_time_total)))
    return ''.join(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment