Last active
July 7, 2020 23:42
-
-
Save garthk/8ba26f5bd0d81196dd3b47020a9a207a to your computer and use it in GitHub Desktop.
Using telemetry to capture OpenCensus trace span in Elixir
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule TelemetryTracingExperimentTest do | |
use ExUnit.Case, async: true | |
@doc """ | |
Dump a variable to standard output, ANSI formatted, and pretty. | |
""" | |
def dump(v) do | |
opts = IEx.Config.inspect_opts() | |
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset] | |
|> IO.ANSI.format() | |
|> IO.puts() | |
end | |
@doc """ | |
Dump a string back out, formatted as the function call. | |
""" | |
def explain(s) do | |
opts = IEx.Config.inspect_opts() | |
[ | |
:reset, | |
"\nexplain ", | |
get_in(opts, [:syntax_colors, :string]) || :reset, | |
"\"\"\"", | |
for line <- s |> String.trim_trailing() |> String.split("\n") do | |
["\n", line] | |
end, | |
"\n\"\"\"", | |
:reset | |
] | |
|> IO.ANSI.format() | |
|> IO.puts() | |
end | |
@doc """ | |
Dump some code to standard output, then run it, and `dump/1` its result also. | |
""" | |
defmacro loudly(do: block) do | |
code = | |
Macro.to_string( | |
quote generated: true do | |
loudly do | |
unquote(block) | |
end | |
end | |
) | |
quote do | |
IO.puts(["\n", unquote(code)]) | |
dump(unquote(block)) | |
end | |
end | |
@tag :experiments | |
test "the whole thing" do | |
explain(""" | |
How long is a second, in :timer native units? Let's measure a tenth of a second: | |
""") | |
loudly do | |
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end) | |
d * 10 | |
end | |
explain(""" | |
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds. | |
If we get 100, we're good: | |
""") | |
loudly do | |
System.convert_time_unit(d, :microsecond, :millisecond) | |
end | |
explain(""" | |
How many times can we call :telemetry.span/3 in a second if there are no handlers? | |
First, let's enforce our assumptions: | |
""") | |
loudly do | |
event_prefix = [:d597a6960262870e] | |
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id) | |
end | |
explain(""" | |
If we separate out, as much as possible: | |
* The code being measured (`{:ok, arg}`) | |
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`) | |
* The benchmark (the call to `:timer.tc/1`) | |
... and measure 10K results, we'll get a rough idea of the overhead: | |
""") | |
loudly do | |
measure_me = fn arg -> | |
ref = make_ref() | |
start_and_exception_metadata = %{arg: arg, ref: ref} | |
:telemetry.span(event_prefix, start_and_exception_metadata, fn -> | |
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE: | |
result = {:ok, arg} | |
# Back to the overhead: | |
stop_metadata = %{arg: arg, ref: ref, result: result} | |
{result, stop_metadata} | |
end) | |
end | |
count = 10_000 | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
assert length(result) == count | |
assert hd(result) == {:ok, 1} | |
assert result |> Enum.reverse() |> hd() == {:ok, count} | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain(""" | |
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry` | |
about. Anyone attaching handlers should think of the extra impact, though. What if there's a | |
handler attached to :start? | |
""") | |
loudly do | |
handler_id = make_ref() | |
handler = fn _n, _m10s, _m6a, _c -> :... end | |
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil) | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain(""" | |
~800K calls per second isn't bad, either. | |
How about three handlers, one of which is almost never called? | |
""") | |
loudly do | |
handler_id = make_ref() | |
handler = fn _n, _m10s, _m6a, _c -> :... end | |
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil) | |
:telemetry.attach(handler_id, event_prefix ++ [:stop], handler, nil) | |
:telemetry.attach(handler_id, event_prefix ++ [:exception], handler, nil) | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain(""" | |
Three handlers that try to start and stop spans? | |
""") | |
loudly do | |
handler_id = make_ref() | |
handle_start = fn _n, _m10s, %{ref: _}, _c -> | |
:ocp.with_child_span("benchmark") | |
end | |
handle_stop = fn _n, _m10s, %{ref: _}, _c -> | |
:ocp.set_status(0, "OK") | |
:ocp.finish_span() | |
end | |
handle_exception = fn _n, _m10s, %{ref: _}, _c -> | |
:ocp.set_status(2, "FAIL") | |
:ocp.finish_span() | |
end | |
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil) | |
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil) | |
:telemetry.attach( | |
{handler_id, :exception}, | |
event_prefix ++ [:exception], | |
handle_exception, | |
nil | |
) | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain(""" | |
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more, | |
but tracing is definitely heavier than telemetry. | |
We got ahead of ourselves, though. How about if we did tracing directly? | |
""") | |
loudly do | |
self_tracing = fn arg -> | |
:ocp.with_child_span("benchmark") | |
try do | |
{:ok, arg} | |
after | |
:ocp.set_status(0, "OK") | |
:ocp.finish_span() | |
end | |
end | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain(""" | |
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original | |
version, let's get defensive. TODO explain concerns. | |
""") | |
loudly do | |
pdict_key = :d597a6960262870e | |
handler_id = make_ref() | |
handle_start = fn _n, _m10s, %{ref: ref}, _c -> | |
previous_span_ctx = :ocp.current_span_ctx() | |
span_ctx = :oc_trace.start_span("benchmark", previous_span_ctx) | |
:ocp.with_span_ctx(span_ctx) | |
pdict_key | |
|> Process.get(%{}) | |
|> put_in([ref], {previous_span_ctx, span_ctx}) | |
|> Process.put(pdict_key) | |
end | |
handle_stop = fn _n, _m10s, %{ref: ref}, _c -> | |
state = Process.get(pdict_key, %{}) | |
{previously, state} = Map.pop(state, ref) | |
Process.put(pdict_key, state) | |
case {previously, :ocp.current_span_ctx()} do | |
{nil, span_ctx} -> | |
# can't be defensive; started without ref? | |
previous_span_ctx = :oc_trace.parent_span_ctx(span_ctx) | |
:oc_trace.set_status(0, "OK", span_ctx) | |
:oc_trace.finish_span(span_ctx) | |
:ocp.with_span_ctx(previous_span_ctx) | |
{{previous_span_ctx, span_ctx}, span_ctx} -> | |
# nothing bad happened! | |
:oc_trace.set_status(0, "OK", span_ctx) | |
:oc_trace.finish_span(span_ctx) | |
:ocp.with_span_ctx(previous_span_ctx) | |
{{previous_span_ctx, span_ctx}, other_span_ctx} -> | |
IO.warn("span mismatch") | |
# expecting more push than pop, let's wind up the other one | |
:oc_trace.set_status(2, "UNKNOWN", other_span_ctx) | |
:oc_trace.finish_span(other_span_ctx) | |
# ... and then back to normal: | |
:oc_trace.set_status(0, "OK", span_ctx) | |
:oc_trace.finish_span(span_ctx) | |
:ocp.with_span_ctx(previous_span_ctx) | |
end | |
end | |
handle_exception = fn _n, _m10s, %{ref: _}, _c -> | |
# TODO also do all that defensive span popping: | |
:ocp.set_status(2, "FAIL") | |
:ocp.finish_span() | |
end | |
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil) | |
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil) | |
:telemetry.attach( | |
{handler_id, :exception}, | |
event_prefix ++ [:exception], | |
handle_exception, | |
nil | |
) | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule ViaTelemetryTest do | |
@moduledoc "Tracing hooked up with Telemetry, OpenTelemetry version." | |
use ExUnit.Case, async: true | |
alias OpenTelemetry.Tracer | |
require OpenTelemetry.Tracer | |
@doc """ | |
Dump a variable to standard output, ANSI formatted, and pretty. | |
""" | |
def dump(v) do | |
opts = IEx.Config.inspect_opts() | |
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset] | |
|> IO.ANSI.format() | |
|> IO.puts() | |
end | |
@doc """ | |
Dump a string back out, formatted as the function call. | |
""" | |
def explain(s) do | |
opts = IEx.Config.inspect_opts() | |
[ | |
:reset, | |
"\nexplain ", | |
get_in(opts, [:syntax_colors, :string]) || :reset, | |
"\"\"\"", | |
for line <- s |> String.trim_trailing() |> String.split("\n") do | |
["\n", line] | |
end, | |
"\n\"\"\"", | |
:reset | |
] | |
|> IO.ANSI.format() | |
|> IO.puts() | |
end | |
@doc """ | |
Dump some code to standard output, then run it, and `dump/1` its result also. | |
""" | |
defmacro loudly(do: block) do | |
code = | |
Macro.to_string( | |
quote generated: true do | |
loudly do | |
unquote(block) | |
end | |
end | |
) | |
quote do | |
IO.puts(["\n", unquote(code)]) | |
unquote(block) | |
|> dump() | |
end | |
end | |
@tag :experiments | |
test "the whole thing" do | |
explain """ | |
TODO explain why I'm checking this | |
""" | |
loudly do | |
:opentelemetry.get_tracer(__MODULE__) | |
end | |
explain """ | |
How long is a second, in :timer native units? Let's measure a tenth of a second: | |
""" | |
loudly do | |
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end) | |
d * 10 | |
end | |
explain """ | |
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds. | |
If we get 100, we're good: | |
""" | |
loudly do | |
System.convert_time_unit(d, :microsecond, :millisecond) | |
end | |
explain """ | |
How many times can we call :telemetry.span/3 in a second if there are no handlers? | |
First, let's enforce our assumptions: | |
""" | |
loudly do | |
event_prefix = [:d597a6960262870e] | |
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id) | |
end | |
explain """ | |
If we separate out, as much as possible: | |
* The code being measured (`{:ok, arg}`) | |
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`) | |
* The benchmark (the call to `:timer.tc/1`) | |
... and measure 10K results, we'll get a rough idea of the overhead: | |
""" | |
loudly do | |
measure_me = fn arg -> | |
ref = make_ref() | |
start_and_exception_metadata = %{arg: arg, ref: ref} | |
:telemetry.span(event_prefix, start_and_exception_metadata, fn -> | |
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE: | |
result = {:ok, arg} | |
# Back to the overhead: | |
stop_metadata = %{arg: arg, ref: ref, result: result} | |
{result, stop_metadata} | |
end) | |
end | |
count = 10_000 | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
assert length(result) == count | |
assert hd(result) == {:ok, 1} | |
assert result |> Enum.reverse() |> hd() == {:ok, count} | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain """ | |
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry` | |
about. Anyone attaching handlers should think of the extra impact, though. What if there's a | |
handler attached to :start? | |
""" | |
loudly do | |
handler_id = make_ref() | |
handler = fn _n, _m10s, _m6a, _c -> :... end | |
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil) | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain """ | |
~800K calls per second isn't bad, either. | |
How about three handlers, one of which is almost never called? | |
""" | |
loudly do | |
handler_id = make_ref() | |
handler = fn _n, _m10s, _m6a, _c -> :... end | |
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil) | |
:telemetry.attach(handler_id, event_prefix ++ [:stop], handler, nil) | |
:telemetry.attach(handler_id, event_prefix ++ [:exception], handler, nil) | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
:telemetry.detach(handler_id) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
explain """ | |
Three handlers that try to start and stop spans? | |
""" | |
loudly do | |
handler_id = make_ref() | |
handle_start = fn _n, _m10s, %{ref: _}, _c -> | |
Tracer.start_span("benchmark") | |
end | |
handle_stop = fn _n, _m10s, %{ref: _}, _c -> | |
Tracer.end_span() | |
end | |
handle_exception = fn _n, _m10s, %{ref: _}, _c -> | |
Tracer.set_status("Unknown") | |
Tracer.end_span() | |
end | |
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil) | |
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil) | |
:telemetry.attach( | |
{handler_id, :exception}, | |
event_prefix ++ [:exception], | |
handle_exception, | |
nil | |
) | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event}) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
loudly do | |
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()} | |
end | |
explain """ | |
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more, | |
but tracing is definitely heavier than telemetry. | |
We got ahead of ourselves, though. How about if we did tracing directly? | |
""" | |
loudly do | |
self_tracing = fn arg -> | |
Tracer.start_span("benchmark") | |
try do | |
{:ok, arg} | |
after | |
Tracer.end_span() | |
end | |
end | |
{d, result} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end) | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
loudly do | |
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()} | |
end | |
explain """ | |
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original | |
version, let's get defensive. TODO explain concerns. | |
""" | |
loudly do | |
# :ok = :dbg.stop_clear() | |
# {:ok, _} = | |
# :dbg.tracer( | |
# :process, | |
# {fn message, :state -> | |
# dump(message) | |
# :state | |
# end, :state} | |
# ) | |
for m <- [:ot_tracer, Process], do: :dbg.tp(m, [{:_, [], [{:return_trace}]}]) | |
:dbg.p(:all, :c) | |
pdict_key = :d597a6960262870e | |
handler_id = make_ref() | |
tracer = :opentelemetry.get_tracer(__MODULE__) | |
handle_start = fn _n, _m10s, %{ref: ref}, _c -> | |
# get the previous span_ctx manually | |
# TODO find way to reach create_span to avoid repeating :ot_ctx.get_value/2 | |
previous_span_ctx = :ot_tracer.current_span_ctx(tracer) | |
span_ctx = :ot_tracer.start_span(tracer, "benchmark", %{}) | |
state = Process.get(pdict_key, %{}) | |
state = put_in(state, [ref], {previous_span_ctx, span_ctx}) | |
Process.put(pdict_key, state) | |
end | |
handle_stop = fn _n, _m10s, %{ref: ref}, _c -> | |
state = Process.get(pdict_key, %{}) | |
{previously, state} = Map.pop(state, ref) | |
Process.put(pdict_key, state) | |
case {previously, Tracer.current_span_ctx()} do | |
{nil, span_ctx} -> | |
IO.puts("NO PREVIOUS #{__ENV__.file}:#{__ENV__.line}") | |
# can't be defensive; started without ref? | |
:ot_tracer.end_span(tracer, span_ctx) | |
# automatically sets parent? | |
# TODO remove crasher: | |
0 = 1 | |
{{previous_span_ctx, span_ctx}, span_ctx} -> | |
# nothing bad happened! end span, implicitly taking previous_span_ctx | |
:ot_tracer.end_span(tracer, span_ctx) | |
# assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx | |
{{previous_span_ctx, span_ctx}, other_span_ctx} -> | |
IO.puts("MISMATCH #{__ENV__.file}:#{__ENV__.line}") | |
IO.warn("span mismatch") | |
# expecting more push than pop, let's wind up the other one | |
:ot_span.set_status(tracer, other_span_ctx, "Unknown") | |
:ot_tracer.end_span(tracer, other_span_ctx) | |
# ... and then back to normal: | |
:ot_tracer.end_span(tracer, span_ctx) | |
# :ot_tracer.set_span(tracer, previous_span_ctx) | |
assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx | |
end | |
end | |
handle_exception = fn _n, _m10s, %{ref: _}, _c -> | |
# TODO also do all that defensive span popping: | |
:ocp.set_status(2, "FAIL") | |
Tracer.end_span() | |
end | |
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil) | |
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil) | |
:telemetry.attach( | |
{handler_id, :exception}, | |
event_prefix ++ [:exception], | |
handle_exception, | |
nil | |
) | |
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end) | |
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event}) | |
:ok = :dbg.stop_clear() | |
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)} | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See also open-telemetry/opentelemetry-erlang-api#26