Last active
July 7, 2024 16:21
-
-
Save sax/2028fa6f8059cc4e0336fb890380f267 to your computer and use it in GitHub Desktop.
Elixir OpenTelemetry LiveView / crash tracking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Config | |
## .... | |
config :logger, | |
backends: [ | |
:console, | |
OpenTelemetryCrashLogger | |
] | |
config :logger, :open_telemetry_crash_logger, level: :error |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule MyApp.OpenTelemetry do | |
@moduledoc """ | |
Configures OpenTelemetry to work with Phoenix/LiveView. | |
""" | |
@dialyzer {:nowarn_function, connection_status: 1} | |
@dialyzer {:nowarn_function, create_child_span: 3} | |
@dialyzer {:nowarn_function, create_parent_ctx: 1} | |
@dialyzer {:nowarn_function, handle_exception: 4} | |
@dialyzer {:nowarn_function, handle_cast: 2} | |
@dialyzer {:nowarn_function, get_peer_data: 1} | |
@dialyzer {:nowarn_function, get_user_agent: 1} | |
@dialyzer {:nowarn_function, open_child_span: 4} | |
use GenServer | |
require Logger | |
require OpenTelemetry.Tracer | |
alias OpenTelemetry.Ctx | |
alias OpenTelemetry.Span | |
alias OpenTelemetry.Tracer | |
alias OpentelemetryPhoenix.Reason | |
def start_link(args) do | |
GenServer.start_link(__MODULE__, args, name: __MODULE__) | |
end | |
def init(_args) do | |
setup() | |
{:ok, nil} | |
end | |
@doc """ | |
Sets up telemetry attachments. Some bindings are registered via `OpentelemetryPhoenix` | |
and `OpentelemetryEcto`, but since neither of those handle LiveView we attach our own | |
bindings for those events. | |
Some bindings are attached before the library code, so that we can add some normalized | |
attributes across most events. | |
""" | |
def setup do | |
#### exception attachments need to be registered *before* we setup | |
#### OpentelemetryPhoenix or OpentelemetryEcto, so we can alter | |
#### the OT spans before those libraries end any spans | |
:telemetry.attach_many( | |
"opentelemetry-error-handler", | |
[ | |
[:phoenix, :endpoint, :exception], | |
[:phoenix, :router_dispatch, :exception] | |
], | |
&handle_errors/4, | |
[] | |
) | |
OpentelemetryPhoenix.setup() | |
OpentelemetryEcto.setup([:my_app, :repo], time_unit: :millisecond) | |
#### start attachments need to be registered *after* we setup | |
#### OpentelemetryPhoenix or OpentelemetryEcto, so OT spans are already | |
#### created for the current process. | |
:telemetry.attach_many( | |
"opentelemetry-phoenix-handler", | |
[ | |
[:my_app, :endpoint, :start], | |
[:phoenix, :endpoint, :start], | |
[:phoenix, :router_dispatch, :start], | |
[:phoenix, :error_rendered] | |
], | |
&add_ot_span_to_logger/4, | |
[] | |
) | |
#### Our custom telemetry handlers operate outside the scope of the | |
#### external libraries, so can be attached in any order. | |
:telemetry.attach_many( | |
"opentelemetry-exception-handler", | |
[ | |
[:my_app, :opentelemetry, :exit], | |
[:phoenix, :live_view, :handle_event, :exception], | |
[:phoenix, :live_view, :mount, :exception] | |
], | |
&handle_exception/4, | |
[] | |
) | |
:telemetry.attach_many( | |
"opentelemetry-start-handler", | |
[ | |
[:phoenix, :live_view, :handle_event, :start], | |
[:phoenix, :live_view, :mount, :start] | |
], | |
&open_child_span/4, | |
[] | |
) | |
:telemetry.attach_many( | |
"opentelemetry-stop-handler", | |
[ | |
[:phoenix, :live_view, :handle_event, :stop], | |
[:phoenix, :live_view, :mount, :stop] | |
], | |
&handle_success/4, | |
[] | |
) | |
end | |
@doc """ | |
Given a telemetry event signaling that an action has begun, open a new child span. | |
Makes sure that a parent span is either created, or attached to, so that events appear | |
as siblings in a single trace spanning the LiveView session. | |
Note that child spans pull attributes off of the socket that are only present if the | |
`:connect_info` is configured to include them in the Endpoint: | |
socket "/live", Phoenix.LiveView.Socket, | |
websocket: [ | |
connect_info: [ | |
:peer_data, | |
:trace_context_headers, | |
:user_agent, | |
:x_headers, | |
session: @session_options | |
] | |
] | |
""" | |
def open_child_span([:phoenix, :live_view, :handle_event, :start], _payload, meta, _config) do | |
get_parent_ctx() | |
|> create_child_span("HANDLE_EVENT #{meta.event}", meta.socket) | |
end | |
def open_child_span([:phoenix, :live_view, :mount, :start], _payload, meta, _config) do | |
meta.socket | |
|> create_parent_ctx() | |
|> create_child_span("MOUNT #{to_module(meta.socket.view)}", meta.socket) | |
end | |
defp create_child_span(parent_ctx, span_name, socket) do | |
new_ctx = Tracer.start_span(parent_ctx, span_name, %{kind: :SERVER}) | |
_ = Tracer.set_current_span(new_ctx) | |
spans = | |
Logger.metadata() | |
|> Keyword.get(:ot_spans, []) | |
|> List.insert_at(0, new_ctx) | |
Logger.metadata(ot_spans: spans) | |
peer_data = get_peer_data(socket) | |
user_agent = get_user_agent(socket) | |
# peer_ip = Map.get(peer_data, :address) | |
attributes = [ | |
# "http.client_ip": client_ip(conn), | |
# "http.flavor": http_flavor(adapter), | |
"http.host": socket.host_uri.host, | |
# "http.method": conn.method, | |
"http.scheme": socket.host_uri.scheme, | |
# "http.target": conn.request_path, | |
"http.user_agent": user_agent, | |
"live_view.connection_status": connection_status(socket), | |
# "net.host.ip": to_string(:inet_parse.ntoa(conn.remote_ip)), | |
"net.host.port": socket.host_uri.port, | |
# "net.peer.ip": to_string(:inet_parse.ntoa(peer_ip)), | |
"net.peer.port": peer_data.port, | |
"net.transport": :"IP.TCP" | |
] | |
Span.set_attributes(new_ctx, attributes) | |
end | |
defp create_parent_ctx(socket) do | |
parent_span = Tracer.start_span("LIVE #{to_module(socket.view)}", %{kind: :SERVER}) | |
parent_ctx = Tracer.set_current_span(Ctx.new(), parent_span) | |
_prev_ctx = Ctx.attach(parent_ctx) | |
Span.end_span(parent_span) | |
Process.put(:ot_parent_ctx, parent_ctx) | |
parent_ctx | |
end | |
defp get_parent_ctx do | |
parent_ctx = Process.get(:ot_parent_ctx) | |
_prev_ctx = Ctx.attach(parent_ctx) | |
parent_ctx | |
end | |
@doc """ | |
Caught exits are cast to ourselves, so that we do not interrupt the logger. | |
""" | |
def handle_cast({:caught_exit, _payload, %{spans: spans, reason: reason, stacktrace: stacktrace}}, state) do | |
crash_attrs = | |
Keyword.merge( | |
[ | |
type: :exit, | |
stacktrace: Exception.format_stacktrace(stacktrace) | |
], | |
Reason.normalize(reason) | |
) | |
for span_ctx <- spans do | |
Span.set_attribute(span_ctx, :status, :crash) | |
Span.add_event(span_ctx, "crash", crash_attrs) | |
Span.set_status(span_ctx, OpenTelemetry.status(:Error, "Error")) | |
OpenTelemetry.Span.end_span(span_ctx) | |
end | |
{:noreply, state} | |
end | |
@doc """ | |
Intended for exceptions that happen during spans that have been opened by OpentelemetryPhoenix or OpentelemetryEcto. | |
Those libraries do not put a high-level `:status` key in the data, which we do for our spans to normalize across different | |
event types. | |
""" | |
def handle_errors(_event, _payload, _metadata, _config) do | |
span_ctx = Tracer.current_span_ctx() | |
Span.set_attribute(span_ctx, :status, :error) | |
Span.end_span(span_ctx) | |
end | |
@doc """ | |
Catch message around exits and exceptions, to make sure that spans are marked with crash/exception status and | |
closed. | |
""" | |
def handle_exception([:my_app, :opentelemetry, :exit], payload, metadata, _config) do | |
GenServer.cast(__MODULE__, {:caught_exit, payload, metadata}) | |
end | |
def handle_exception([:phoenix, :live_view, _, :exception], _payload, %{kind: kind, reason: reason, stacktrace: stacktrace}, _config) do | |
exception_attrs = | |
Keyword.merge( | |
[ | |
type: kind, | |
stacktrace: Exception.format_stacktrace(stacktrace) | |
], | |
Reason.normalize(reason) | |
) | |
status = OpenTelemetry.status(:Error, "Error") | |
span_ctx = Tracer.current_span_ctx() | |
Span.set_attribute(span_ctx, :status, :error) | |
Span.add_event(span_ctx, "exception", exception_attrs) | |
Span.set_status(span_ctx, status) | |
Span.end_span(span_ctx) | |
pop_span_ctx() | |
end | |
@doc """ | |
Set status to `:ok` when a `:stop` event fires, which suggests that whatever happened | |
was successful, ie did not crash or raise. | |
""" | |
def handle_success([:phoenix, :live_view, _, :stop], _payload, _meta, _config) do | |
span_ctx = Tracer.current_span_ctx() | |
Span.set_attribute(span_ctx, :status, :ok) | |
Span.end_span(span_ctx) | |
pop_span_ctx() | |
end | |
@doc """ | |
When crashes or exits occur, rather than exceptions, no telemetry fires to close open | |
spans. To get around this, we keep a list of open spans in the `Logger.metadata`, and | |
check for crashes in a `OpenTelemetryCrashLogger`. | |
If a crash occurs where any `:ot_spans` live in the metadata, a `[:my_app, :opentelemetry, :exit]` | |
telemetry event is fired, which gets picked up by our `handle_exception` binding. | |
""" | |
def add_ot_span_to_logger(event, _payload, _meta, _config) do | |
Tracer.current_span_ctx() | |
|> case do | |
:undefined -> | |
Logger.warn("[#{__MODULE__}] (#{inspect(event)}) expected current process have an OpenTelemetry span, but does not") | |
span_ctx -> | |
spans = | |
Logger.metadata() | |
|> Keyword.get(:ot_spans, []) | |
|> List.insert_at(0, span_ctx) | |
Logger.metadata(ot_spans: spans) | |
end | |
end | |
#### See if any of this info is present on a socket in the context | |
#### of GCP / DO. | |
# defp client_ip(%{remote_ip: remote_ip} = conn) do | |
# case Plug.Conn.get_req_header(conn, "x-forwarded-for") do | |
# [] -> | |
# to_string(:inet_parse.ntoa(remote_ip)) | |
# [client | _] -> | |
# client | |
# end | |
# end | |
defp pop_span_ctx do | |
spans = | |
Logger.metadata() | |
|> Keyword.get(:ot_spans, []) | |
|> case do | |
[_span | rest] -> rest | |
[] -> [] | |
end | |
Logger.metadata(ot_spans: spans) | |
spans | |
|> case do | |
[parent | _] -> parent | |
_ -> :undefined | |
end | |
|> Tracer.set_current_span() | |
end | |
defp get_peer_data(%{private: %{connect_info: %{peer_data: peer_data}}}), do: peer_data | |
defp get_peer_data(_), do: %{port: nil, address: nil, ssl_cert: nil} | |
defp get_user_agent(%{private: %{connect_info: %{user_agent: user_agent}}}), do: user_agent | |
defp get_user_agent(_), do: "" | |
defp connection_status(%{connected?: true}), do: "connected" | |
defp connection_status(%{connected?: false}), do: "disconnected" | |
defp to_module(module) do | |
module | |
|> to_string() | |
|> case do | |
"Elixir." <> name -> name | |
erlang_module -> ":#{erlang_module}" | |
end | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule OpenTelemetryCrashLogger do | |
@moduledoc """ | |
Try to capture exits and report them in OpenTelemetry. | |
References: | |
* https://github.com/elixir-lang/elixir/blob/v1.11.3/lib/logger/lib/logger/backends/console.ex | |
""" | |
@behaviour :gen_event | |
defstruct level: nil | |
def init(__MODULE__), do: init({__MODULE__, :open_telemetry_crash_logger}) | |
def init({__MODULE__, config_name}) do | |
config = Application.get_env(:logger, config_name) | |
{:ok, new(config, %__MODULE__{})} | |
end | |
def handle_call({:configure, _options}, state) do | |
{:ok, :ok, state} | |
end | |
def handle_event({:info, _, _}, state), do: {:ok, state} | |
def handle_event({:error, _pid, {Logger, _, _timestamp, metadata}}, state) do | |
case Keyword.get(metadata, :crash_reason) do | |
nil -> | |
{:ok, state} | |
{crash_reason, stacktrace} -> | |
case Keyword.get(metadata, :ot_spans) do | |
nil -> | |
{:ok, state} | |
spans when is_list(spans) -> | |
:telemetry.execute([:my_app, :opentelemetry, :exit], %{}, %{spans: spans, reason: crash_reason, stacktrace: stacktrace}) | |
{:ok, state} | |
end | |
end | |
{:ok, state} | |
end | |
def handle_event({level, _gl, _thing} = _event, state) do | |
%{level: log_level} = state | |
if meet_level?(level, log_level) do | |
{:ok, state} | |
else | |
{:ok, state} | |
end | |
end | |
def handle_event(:flush, state) do | |
{:ok, flush(state)} | |
end | |
def handle_event(_msg, state) do | |
{:ok, state} | |
end | |
def handle_info({:io_reply, _ref, _msg}, state), do: {:ok, state} | |
def handle_info({:EXIT, _pid, _reason}, state), do: {:ok, state} | |
defp flush(state), do: state | |
defp meet_level?(_lvl, nil), do: true | |
defp meet_level?(lvl, min), do: Logger.compare_levels(lvl, min) != :lt | |
defp new(config, state) do | |
level = Keyword.get(config, :level, :error) | |
%{ | |
state | |
| level: level | |
} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment