A custom ExUnit test reporter that saves an HTML report of the LLM calls made during integration tests
defmodule Postline.IntegrationCase do
@moduledoc """
This module defines the setup for tests requiring
access to real external services like OpenAI.
Integration tests are meant to test actual integration with
third-party services and are not run by default with `mix test`.
Use `mix test.integration` to run integration tests.
These tests can incur costs as they make real API calls!
"""
use ExUnit.CaseTemplate
alias Ecto.Adapters.SQL.Sandbox
using do
quote do
# Import all the same test helpers as DataCase
# ...
import ExUnit.CaptureLog
import Postline.IntegrationCase
# Tag all integration tests
@moduletag :integration
def chat_completion(prompt_module, opts) do
Postline.IntegrationCase.chat_completion(__MODULE__, prompt_module, opts)
end
end
end
setup tags do
# Refuse to run integration tests unless INTEGRATION_TEST=true
# (DataCase has the reverse condition)
if Application.get_env(:postline, Postline.IntegrationCase)[:enabled] != true do
raise "Integration tests can't run unless INTEGRATION_TEST=true"
end
Postline.IntegrationCase.setup_sandbox(tags)
:ok
end
@doc """
Sets up the sandbox based on the test tags.
"""
def setup_sandbox(tags) do
pid = Sandbox.start_owner!(Postline.Repo, shared: not tags[:async])
on_exit(fn -> Sandbox.stop_owner(pid) end)
end
def llm_judge(test_module, question, {:ok, result}), do: llm_judge(test_module, question, result)
def llm_judge(test_module, question, %{"choices" => [%{"message" => msg}]}),
do: llm_judge(test_module, question, Jason.encode!(msg, pretty: true))
def llm_judge(test_module, question, output) do
inputs = %{question: question, output: output}
with {:ok, %{"choices" => [%{"message" => %{"content" => content}}]}} <-
chat_completion(test_module, Postline.Prompts.Library.LLMJudge, inputs: inputs),
{:ok, %{"pass" => pass, "reason" => reason, "score" => score}} <- Jason.decode(content) do
%{pass: pass, reason: reason, score: score}
else
{:error, error} -> throw("Invalid return value from LLM judge: #{inspect(error)}")
error -> throw("Invalid return value from LLM judge: #{inspect(error)}")
end
end
@doc """
In Postline, prompts are module-based.
Each prompt module exposes a `format/1` function, which takes inputs and optional
configuration and returns a request to be sent to the LLM.
We're using OpenRouter under the hood; this is something you'll want to adapt to your own setup.
"""
def chat_completion(test_module, prompt_module, opts) do
# put your default templating inputs here
default_inputs = %{}
opts = Keyword.update(opts, :inputs, default_inputs, &Map.merge(default_inputs, &1))
# format prompt
with {:ok, request} <- prompt_module.format(opts),
{:ok, response} <- Postline.OpenRouter.chat_completion(request) do
Postline.TestReporter.report_llm_call(test_module, request, response)
{:ok, response}
end
end
@doc """
Asserts that an LLM response passes a specific evaluation criterion.
## Examples
# Simple assertion
assert_llm "Is it polite?", result
# With minimum score requirement
assert_llm "Is it polite?", result, min_score: 0.8
# With custom error message
assert_llm "Is it polite?", result, message: "Response must be polite"
## Options
* `:min_score` - Minimum score required to pass (default: 0.5)
* `:message` - Custom error message to display on failure
"""
defmacro assert_llm(question, output, opts \\ []) do
min_score = Keyword.get(opts, :min_score, 0.5)
custom_message = Keyword.get(opts, :message, nil)
quote do
question = unquote(question)
output = unquote(output)
min_score = unquote(min_score)
custom_message = unquote(custom_message)
# Evaluate the output expression only once, then run the LLM judge on it
result = llm_judge(__MODULE__, question, output)
if !(result.pass && result.score >= min_score) do
message = """
LLM response failed evaluation:
Question: #{question}
Output: #{inspect(output)}
Pass: #{result.pass}
Score: #{result.score} (minimum: #{min_score})
Reason: #{result.reason}
"""
message = if custom_message, do: message <> "\n\n#{inspect(custom_message)}", else: message
flunk(message)
end
# Return the result for potential further assertions
result
end
end
end
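# Example of how a test module built on this case template might look
# (a minimal sketch; `MyApp.Prompts.SupportReply` and its inputs are hypothetical
# placeholders, not part of this gist):
defmodule Postline.SupportReplyIntegrationTest do
  use Postline.IntegrationCase

  test "reply to an angry customer stays polite" do
    # Makes a real API call and reports it to the HTML report via TestReporter
    {:ok, response} =
      chat_completion(MyApp.Prompts.SupportReply, inputs: %{message: "This product is terrible!"})

    try do
      assert_llm "Is the reply polite and apologetic?", {:ok, response}, min_score: 0.8
    rescue
      ExUnit.AssertionError ->
        # Fall back to a weaker rubric, but flag the test as "partial" in the report
        Postline.TestReporter.mark_test_rescue_used(__MODULE__)
        assert_llm "Is the reply at least not rude?", {:ok, response}
    end
  end
end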
defmodule Postline.Prompts.Library.LLMJudge do
@moduledoc false
# Import the SigilLiquid module to use the ~SYSTEM, ~USER, ~LIQUID sigils
use Postline.Prompts.Prompt
@response_format %{
"type" => "json_schema",
"json_schema" => %{
"name" => "results",
"schema" => %{
"type" => "object",
"required" => ["reason", "score", "pass"],
"properties" => %{
"reason" => %{
"type" => "string",
"description" => "Analysis of the rubric and the output"
},
"score" => %{
"type" => "number",
"description" => "Score between 0.0 and 1.0"
},
"pass" => %{
"type" => "boolean",
"description" => "Whether the output passes the evaluation"
}
}
}
}
}
@doc """
Returns the parameters for the prompt.
Valid parameters are:
- model: The model to use for the prompt.
- temperature: The temperature to use for the prompt.
- top_p: The top_p to use for the prompt.
- top_k: The top_k to use for the prompt.
- max_tokens: The max_tokens to use for the prompt.
For the full list of parameters, see the OpenRouter API documentation:
https://openrouter.ai/docs/api-reference/parameters
"""
def params do
%{model: "google/gemini-2.0-flash-001", temperature: 0.0, response_format: @response_format}
end
@doc """
Returns the messages for the prompt.
"""
def messages do
[
~SYSTEM"""
You are an expert judge tasked with evaluating the quality of LLM-generated outputs against specific criteria.
Your job is to analyze the provided output and determine if it meets the requirements specified in the rubric. You must be fair, consistent, and thorough in your evaluation.
For each evaluation, you will:
1. Carefully analyze the output against the provided rubric
2. Provide a detailed explanation of your reasoning in the "reason" field
3. Assign a score between 0.0 (completely fails) and 1.0 (perfectly meets criteria)
4. Make a clear pass/fail determination
Below you will find:
1. The rubric with criteria for evaluation
2. The output that needs to be evaluated
RUBRIC:
{{ question }}
OUTPUT TO EVALUATE:
```
{{ output }}
```
Your response must be structured as a JSON object with the following fields:
- "reason": A detailed analysis explaining how the output meets or fails to meet the criteria
- "score": A number between 0.0 and 1.0 representing the quality of the output
- "pass": A boolean (true/false) indicating whether the output passes the evaluation
Be objective and focus solely on the criteria provided in the rubric. Do not introduce your own criteria or biases.
"""
]
end
@doc """
Returns the tools for the prompt.
"""
def tools do
[]
end
end
defmodule Postline.TestFormatter do
@moduledoc """
Formats test stats as HTML for static report generation.
"""
@doc """
Converts test state into formatted HTML.
"""
def format_html(state) do
# Calculate summary statistics
total_tests = Enum.count(state.tests)
completed_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.started && test_data.finished && test_data.error == nil
end)
partial_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.started && test_data.finished && test_data.error == nil && test_data.used_rescue
end)
failed_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.error != nil
end)
# All completed tests count as passed (partials included)
passed_tests = completed_tests
# Fully passed tests are those without rescue
_fully_passed_tests = passed_tests - partial_tests
# Calculate duration statistics
durations =
Enum.flat_map(state.tests, fn {_key, test_data} ->
case test_data[:duration_ms] do
nil -> []
duration -> [duration]
end
end)
avg_duration =
if Enum.empty?(durations) do
0
else
round(Enum.sum(durations) / length(durations))
end
max_duration = if Enum.empty?(durations), do: 0, else: Enum.max(durations)
passed_percentage = if total_tests > 0, do: Float.round(passed_tests / total_tests * 100, 1), else: 0.0
partial_percentage = if total_tests > 0, do: Float.round(partial_tests / total_tests * 100, 1), else: 0.0
failed_percentage = if total_tests > 0, do: Float.round(failed_tests / total_tests * 100, 1), else: 0.0
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Postline Test Report</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
h1, h2, h3 {
color: #2d3748;
}
.header {
margin-bottom: 30px;
}
.timestamp {
color: #718096;
font-size: 14px;
margin-top: 5px;
margin-bottom: 4px;
}
.duration-stats {
color: #718096;
font-size: 14px;
margin-bottom: 2px;
}
.duration-stats .value {
font-weight: 600;
color: #4a5568;
}
.stats-container {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin: 25px 0;
}
.stat-card {
background-color: white;
border-radius: 8px;
padding: 15px 20px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
cursor: pointer;
position: relative;
transition: transform 0.1s ease-in-out;
}
.stat-card:hover {
transform: translateY(-2px);
box-shadow: 0 2px 5px rgba(0,0,0,0.15);
}
.stat-card.active {
box-shadow: 0 0 0 2px #4299e1;
}
.stat-card .filter-text {
position: absolute;
top: 5px;
right: 10px;
font-size: 11px;
color: #718096;
font-weight: 500;
display: none;
}
.stat-card.active .filter-text {
display: block;
color: #4299e1;
}
.stat-card.passed {
border-left: 5px solid #38a169;
}
.stat-card.partial {
border-left: 5px solid #d69e2e;
}
.stat-card.failed {
border-left: 5px solid #e53e3e;
}
.stat-title {
font-size: 14px;
color: #718096;
margin-bottom: 5px;
}
.stat-value {
font-size: 22px;
font-weight: 600;
margin-bottom: 3px;
}
.stat-percentage {
font-size: 14px;
color: #718096;
}
table {
border-collapse: collapse;
width: 100%;
margin: 20px 0;
background-color: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
table-layout: fixed;
}
th, td {
padding: 12px 16px;
border-bottom: 1px solid #e2e8f0;
vertical-align: middle;
white-space: nowrap;
}
th {
background-color: #f7fafc;
font-weight: 600;
color: #4a5568;
}
/* Column sizing */
th:first-child, td:first-child {
width: 70%;
text-align: left;
white-space: normal;
overflow: hidden;
text-overflow: ellipsis;
}
th:nth-child(2), td:nth-child(2) {
width: 10%;
text-align: right;
}
th:nth-child(3), td:nth-child(3) {
width: 10%;
text-align: center;
}
th:nth-child(4), td:nth-child(4) {
width: 10%;
text-align: right;
}
tr:last-child td {
border-bottom: none;
}
tr:hover {
background-color: #f7fafc;
}
.badge {
display: inline-block;
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: 600;
}
.badge-success {
background-color: #c6f6d5;
color: #22543d;
}
.badge-partial {
background-color: #fefcbf;
color: #744210;
}
.badge-pending {
background-color: #e9d8fd;
color: #553c9a;
}
.badge-error {
background-color: #fed7d7;
color: #9b2c2c;
}
.badge-duration-fast {
background-color: #c6f6d5;
color: #22543d;
}
.badge-duration-medium {
background-color: #fefcbf;
color: #744210;
}
.badge-duration-slow {
background-color: #fed7d7;
color: #9b2c2c;
}
.badge-duration-very-slow {
background-color: #2d3748;
color: #ffffff;
}
a {
color: #4299e1;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
.filter-controls {
display: flex;
justify-content: flex-end;
margin-bottom: 10px;
align-items: center;
}
.filter-controls label {
margin-right: 8px;
font-size: 14px;
color: #4a5568;
}
.filter-controls button {
background-color: #edf2f7;
border: none;
padding: 6px 12px;
margin-left: 5px;
border-radius: 4px;
font-size: 12px;
font-weight: 500;
color: #4a5568;
cursor: pointer;
transition: all 0.2s;
}
.filter-controls button:hover {
background-color: #e2e8f0;
}
.filter-controls button.active {
background-color: #4299e1;
color: white;
}
tr.hide {
display: none;
}
.text-right {
text-align: right;
}
.text-center {
text-align: center;
}
</style>
</head>
<body>
<div class="header">
<h1>Postline Test Report</h1>
<div class="duration-stats">Time: <span class="value">#{format_berlin_time()}</span></div>
<div class="duration-stats">Average: <span class="value">#{avg_duration}ms</span></div>
<div class="duration-stats">Longest: <span class="value">#{max_duration}ms</span></div>
</div>
<div class="stats-container">
<div class="stat-card passed" data-filter="success" onclick="filterTests('success')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Passed</div>
<div class="stat-value">#{passed_tests}/#{total_tests}</div>
<div class="stat-percentage">#{passed_percentage}%</div>
</div>
<div class="stat-card partial" data-filter="partial" onclick="filterTests('partial')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Partial</div>
<div class="stat-value">#{partial_tests}/#{total_tests}</div>
<div class="stat-percentage">#{partial_percentage}%</div>
</div>
<div class="stat-card failed" data-filter="error" onclick="filterTests('error')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Failed</div>
<div class="stat-value">#{failed_tests}/#{total_tests}</div>
<div class="stat-percentage">#{failed_percentage}%</div>
</div>
</div>
<div class="filter-controls">
<label>Filter:</label>
<button id="filter-all" class="active" onclick="filterTests('all')">All Tests</button>
<button id="filter-success" onclick="filterTests('success')">Passed</button>
<button id="filter-partial" onclick="filterTests('partial')">Partial</button>
<button id="filter-error" onclick="filterTests('error')">Failed</button>
</div>
<table>
<thead>
<tr>
<th>Test</th>
<th>Status</th>
<th class="text-center">Duration</th>
<th>View</th>
</tr>
</thead>
<tbody>
#{render_tests(state.tests)}
</tbody>
</table>
<script>
function filterTests(filterType) {
// Reset all filters
document.querySelectorAll('.filter-controls button').forEach(btn => {
btn.classList.remove('active');
});
document.querySelectorAll('.stat-card').forEach(card => {
card.classList.remove('active');
});
// Set the active button
document.getElementById('filter-' + filterType).classList.add('active');
// If it's not 'all', set the active card
if (filterType !== 'all') {
document.querySelector('.stat-card[data-filter="' + filterType + '"]').classList.add('active');
}
// Filter the table rows
const rows = document.querySelectorAll('tbody tr');
rows.forEach(row => {
const statusBadge = row.querySelector('.badge');
if (!statusBadge) return; // Skip error rows
if (filterType === 'all') {
row.classList.remove('hide');
} else {
// Check if the badge matches the filter
const hasClass = statusBadge.classList.contains('badge-' + filterType);
if (hasClass) {
row.classList.remove('hide');
} else {
row.classList.add('hide');
}
}
});
}
// Initialize with all tests showing
window.onload = function() {
filterTests('all');
};
</script>
</body>
</html>
"""
end
defp render_tests(tests) do
Enum.map_join(tests, "", fn {{_module, name}, test_data} ->
# Remove "test " from the beginning of the test name
# First, ensure name is a string before trying to replace
display_name =
name
|> to_string()
|> String.replace(~r/^test /, "")
status_badge =
case test_data do
%{error: error} when error != nil ->
~s(<span class="badge badge-error">Error</span>)
%{started: true, finished: true, used_rescue: true} ->
~s(<span class="badge badge-partial">Partial</span>)
%{started: true, finished: true} ->
~s(<span class="badge badge-success">Passed</span>)
%{started: true, finished: false} ->
~s(<span class="badge badge-pending">Running</span>)
end
duration_badge = create_duration_badge(test_data[:duration_ms])
llm_link =
case test_data[:llm_report_link] do
nil ->
""
link ->
llm_count = Enum.count(test_data.llm_calls)
~s(<a href="#{link}">View \(#{llm_count}\)</a>)
end
error_row =
case test_data.error do
nil ->
""
error ->
"""
<tr>
<td colspan="4" class="badge-error" style="white-space: pre-wrap;">#{error}</td>
</tr>
"""
end
"""
<tr>
<td>#{display_name}</td>
<td>#{status_badge}</td>
<td class="text-center">#{duration_badge}</td>
<td>#{llm_link}</td>
</tr>
#{error_row}
"""
end)
end
defp create_duration_badge(nil), do: "N/A"
defp create_duration_badge(duration_ms) do
{badge_class, badge_text} =
cond do
duration_ms <= 3000 -> {"badge-duration-fast", "#{duration_ms}ms"}
duration_ms <= 5000 -> {"badge-duration-medium", "#{duration_ms}ms"}
duration_ms <= 8000 -> {"badge-duration-slow", "#{duration_ms}ms"}
true -> {"badge-duration-very-slow", "#{duration_ms}ms"}
end
~s(<span class="badge #{badge_class}">#{badge_text}</span>)
end
defp format_berlin_time do
# Get current UTC time
utc_now = DateTime.utc_now()
# Determine if we're in DST (roughly - actual calculation is more complex)
# This is a simple approximation for Berlin time
month = utc_now.month
# CEST (+2 hours)
dst_offset =
if month in 3..10 do
2
else
# CET (+1 hour)
1
end
# Add the Berlin offset to UTC
berlin_time = DateTime.add(utc_now, dst_offset, :hour)
# Format as HH:MM
Calendar.strftime(berlin_time, "%H:%M")
end
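# A more precise alternative (a sketch, assuming the optional `tzdata` dependency is
# installed and configured as the time zone database) would shift into Europe/Berlin
# directly instead of approximating DST:
#
#   # config/config.exs
#   config :elixir, :time_zone_database, Tzdata.TimeZoneDatabase
#
#   DateTime.utc_now()
#   |> DateTime.shift_zone!("Europe/Berlin")
#   |> Calendar.strftime("%H:%M")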
@doc """
Generates HTML content for a single test's LLM calls.
"""
def format_llm_calls_html(module, name, llm_calls) do
# Reverse calls to show oldest first
calls_content =
llm_calls
|> Enum.reverse()
|> Enum.with_index(1)
|> Enum.map_join(&render_single_llm_call/1)
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Calls - #{inspect(module)} - #{inspect(name)}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
h1, h2 {
color: #2d3748;
}
.llm-call {
border: 1px solid #e2e8f0;
border-radius: 8px;
margin-bottom: 20px;
background-color: #fff; /* Change background */
}
.llm-call-header {
background-color: #edf2f7;
padding: 10px 15px;
font-weight: 600;
border-bottom: 1px solid #e2e8f0;
border-top-left-radius: 8px;
border-top-right-radius: 8px;
display: flex; /* Use flex for alignment */
justify-content: space-between; /* Space out content */
align-items: center; /* Center items vertically */
}
.llm-call-body {
padding: 0 15px 15px 15px; /* Adjust padding */
}
.llm-request-section {
/* Hide request by default */
/* max-height: 0;
overflow: hidden; */
/* transition: max-height 0.3s ease-out; */
/* Let's keep it visible initially and make it collapsible */
padding-top: 15px;
border-top: 1px dashed #e2e8f0;
margin-top: 15px;
}
/* Style for when request is hidden - Add class via JS */
.llm-request-section.hidden {
display: none;
}
.message {
border: 1px solid #cbd5e0;
border-radius: 6px;
margin-bottom: 10px;
background-color: #f7fafc; /* Background for messages */
}
.message-header {
background-color: #e2e8f0;
padding: 5px 10px;
font-weight: 500;
text-transform: capitalize;
border-bottom: 1px solid #cbd5e0;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
}
.message-content {
padding: 10px;
white-space: pre-wrap; /* Preserve whitespace and wrap */
word-wrap: break-word;
}
.reasoning-content {
padding: 10px;
white-space: pre-wrap;
word-wrap: break-word;
font-size: 0.85em; /* Smaller font for reasoning */
color: #4a5568; /* Slightly darker text for reasoning */
background-color: #f0f4f8; /* Slightly different background */
border-bottom: 1px solid #e2e8f0;
}
.tool-call {
border: 1px dashed #a0aec0;
border-radius: 4px;
padding: 10px;
margin: 10px;
background-color: #f0f4f8;
}
pre {
background-color: #2d3748;
color: #f7fafc;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
}
code {
font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
}
.toggle-button {
background-color: #4a5568;
color: white;
border: none;
padding: 5px 10px;
border-radius: 4px;
cursor: pointer;
font-size: 12px;
}
.toggle-button:hover {
background-color: #2d3748;
}
</style>
</head>
<body>
<h1>LLM Calls</h1>
<h2>#{inspect(module)}</h2>
<h3>#{name}</h3>
<a href="../test_report.html">&larr; Back to Main Report</a>
#{calls_content}
<script>
document.querySelectorAll('.toggle-button').forEach(button => {
button.addEventListener('click', function() {
const callIndex = this.getAttribute('data-call-index');
const requestSection = document.getElementById(`request-${callIndex}`);
if (requestSection) {
requestSection.classList.toggle('hidden');
// Change button text
if (requestSection.classList.contains('hidden')) {
this.textContent = 'Show Request';
} else {
this.textContent = 'Hide Request';
}
}
});
});
</script>
</body>
</html>
"""
end
defp render_single_llm_call({llm_call, index}) do
request_content = format_request(llm_call.request)
response_content = format_response(llm_call.response)
timestamp_str = llm_call.timestamp |> DateTime.truncate(:second) |> DateTime.to_string()
request_section_id = "request-#{index}"
"""
<div class="llm-call">
<div class="llm-call-header">
<span>LLM Call ##{index} (#{timestamp_str})</span>
<button class="toggle-button" data-call-index="#{index}">Show Request</button>
</div>
<div class="llm-call-body">
<div id="#{request_section_id}" class="llm-request-section hidden">
<h3>Request</h3>
#{request_content}
</div>
<div>
<h3>Response</h3>
#{response_content}
</div>
</div>
</div>
"""
end
defp format_request(request) do
# Assuming request has a :messages key similar to OpenAI
messages = request[:messages] || request["messages"] || []
Enum.map_join(messages, &format_message/1)
end
defp format_response(response) do
# Assuming response format similar to OpenAI ChatCompletion
choices = response[:choices] || response["choices"] || []
Enum.map_join(choices, fn choice ->
message = choice[:message] || choice["message"]
finish_reason = choice[:finish_reason] || choice["finish_reason"]
message_html = format_message(message)
"""
#{message_html}
<p><em>Finish Reason: #{finish_reason || "N/A"}</em></p>
"""
end)
end
defp format_message(message) when is_map(message) do
role = Map.get(message, "role")
content = Map.get(message, "content")
reasoning = Map.get(message, "reasoning")
tool_calls = Map.get(message, "tool_calls")
content_html = ""
# Add reasoning with different styling if present
content_html =
if is_binary(reasoning) and String.trim(reasoning) != "" do
content_html <> ~s(<div class="reasoning-content">#{Plug.HTML.html_escape(reasoning)}</div>)
else
content_html
end
# Add content with regular styling if present
content_html =
if is_binary(content) and String.trim(content) != "" do
content_html <> ~s(<div class="message-content">#{Plug.HTML.html_escape(content)}</div>)
else
content_html <> ~s(<div class="message-content"><em>No text content</em></div>)
end
content_html =
if is_list(tool_calls) and not Enum.empty?(tool_calls) do
content_html <> format_tool_calls(tool_calls)
else
content_html
end
"""
<div class="message">
<div class="message-header">Role: #{role}</div>
#{content_html}
</div>
"""
end
defp format_message(_), do: "<p><em>Invalid message format</em></p>"
defp format_tool_calls(tool_calls) when is_list(tool_calls) do
Enum.map_join(tool_calls, fn tool_call ->
# Tool calls arrive from the API as maps with string keys
id = Map.get(tool_call, "id")
type = Map.get(tool_call, "type")
function_map = Map.get(tool_call, "function")
name = Map.get(function_map, "name")
arguments = Map.get(function_map, "arguments")
# Attempt to pretty-print arguments if they look like JSON
formatted_args =
case Jason.decode(arguments) do
{:ok, decoded_args} ->
decoded_args
|> Jason.encode!(pretty: true)
|> Plug.HTML.html_escape()
_ ->
Plug.HTML.html_escape(arguments)
end
"""
<div class="tool-call">
<strong>Tool Call ID:</strong> #{id}<br>
<strong>Type:</strong> #{type}<br>
<strong>Function Name:</strong> #{name}<br>
<strong>Arguments:</strong>
<pre><code>#{formatted_args}</code></pre>
</div>
"""
end)
end
defp format_tool_calls(_), do: "<em>Invalid tool calls format</em>"
end
integration_test_config = Application.get_env(:postline, Postline.IntegrationCase)
opts =
if integration_test_config[:enabled],
do: [formatters: [ExUnit.CLIFormatter, Postline.TestReporter]],
else: []
ExUnit.start(opts)
Ecto.Adapters.SQL.Sandbox.mode(Postline.Repo, :manual)
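# One possible wiring for the `:enabled` flag and the `mix test.integration` task used
# above (a sketch; the env var handling and alias are assumptions based on the
# IntegrationCase moduledoc, not part of this gist):
#
#   # config/test.exs
#   config :postline, Postline.IntegrationCase,
#     enabled: System.get_env("INTEGRATION_TEST") == "true"
#
#   # mix.exs
#   defp aliases do
#     ["test.integration": &test_integration/1]
#   end
#
#   defp test_integration(args) do
#     System.put_env("INTEGRATION_TEST", "true")
#     Mix.Task.run("test", ["--only", "integration" | args])
#   end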
defmodule Postline.TestReporter do
@moduledoc false
use GenServer
require Logger
# Client API
def start_link(opts \\ []) do
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end
@doc """
Reports an LLM generation call made during a test.
Determines the current test based on the provided module.
Requires the module, the request, and the response.
"""
def report_llm_call(module, request, response) do
GenServer.cast(__MODULE__, {:llm_generation, module, request, response})
end
@doc """
Marks the current test as having used a rescue block. This will be used
in the test reporter to show the test as "partial" rather than "passed".
"""
def mark_test_rescue_used(module) do
GenServer.cast(__MODULE__, {:mark_rescue_used, module})
end
# Server Callbacks
@impl true
def init(_opts) do
state = %{
# Structure: {module, name} => %{started: bool, finished: bool, llm_calls: list}
tests: %{}
}
# ExUnit starts formatters without a registered name, so register one here
# so that report_llm_call/3 and mark_test_rescue_used/1 can cast to this process
Process.register(self(), __MODULE__)
{:ok, state}
end
@impl true
def handle_cast({:suite_finished, _times_used}, state) do
# Generate HTML report and save it to a file
save_html_report(state)
{:noreply, state}
end
@impl true
def handle_cast({event, %{state: {:excluded, _reason}}}, state) when event in [:test_started, :test_finished] do
{:noreply, state}
end
@impl true
def handle_cast({:test_started, test}, state) do
test_key = {test.module, test.name}
# Initialize test state with current timestamp
start_time = System.monotonic_time()
new_state =
put_in(state, [:tests, test_key], %{
started: true,
finished: false,
error: nil,
llm_calls: [],
used_rescue: false,
start_time: start_time
})
{:noreply, new_state}
end
@impl true
def handle_cast({:test_finished, test}, state) do
test_key = {test.module, test.name}
end_time = System.monotonic_time()
# Calculate duration in milliseconds
start_time = get_in(state, [:tests, test_key, :start_time]) || end_time
duration_ms = System.convert_time_unit(end_time - start_time, :native, :millisecond)
# Print result to terminal
test_name = "#{inspect(test.module)} #{test.name}"
status =
case test.state do
{:failed, _} -> IO.ANSI.red() <> "FAILED" <> IO.ANSI.reset()
_ -> IO.ANSI.green() <> "PASSED" <> IO.ANSI.reset()
end
# Only print tests that carry a non-nil state (e.g. failures), so we don't duplicate ExUnit's own output for passing tests
if test.state != nil do
IO.puts("#{test_name} (#{duration_ms}ms): #{status}")
end
# Mark test as finished
error =
case test.state do
{:failed, [{:error, error, stacktrace}]} ->
file_link =
case stacktrace do
# Link to the first stack frame that carries file/line info
[{_mod, _func, _arity, [file: file, line: line]} | _] ->
"<a href=\"cursor://file/#{Path.absname(file)}:#{line}\">#{file}:#{line}</a>"
_ ->
""
end
message =
case error do
%ExUnit.AssertionError{left: left, right: right} when left != :ex_unit_no_meaningful_value ->
"Expected to be equal:<br>Left: #{inspect(left)}<br>Right: #{inspect(right)}"
%{message: message} when is_binary(message) ->
message
error ->
inspect(error)
end
file_link <> "<br>" <> message
{:failed, _} ->
"Unknown error"
_ ->
nil
end
new_state =
update_in(
state,
[:tests, test_key],
&(&1
|> Map.put(:finished, true)
|> Map.put(:error, error)
|> Map.put(:duration_ms, duration_ms))
)
{:noreply, new_state}
end
@impl true
def handle_cast({:mark_rescue_used, module}, state) do
# Find the currently running test for the given module
test_key =
Enum.find_value(state.tests, fn
{{^module, test_name}, %{started: true, finished: false}} -> {module, test_name}
_ -> nil
end)
case test_key do
nil ->
# No running test found for this module
Logger.error(
"Attempted to mark rescue used for module #{inspect(module)}, but no test is currently running for it."
)
{:noreply, state}
test_key ->
# Found the running test, mark it as having used a rescue block
new_state = put_in(state, [:tests, test_key, :used_rescue], true)
{:noreply, new_state}
end
end
@impl true
def handle_cast({:llm_generation, module, request, response}, state) do
# Find the currently running test for the given module
test_key =
Enum.find_value(state.tests, fn
{{^module, test_name}, %{started: true, finished: false}} -> {module, test_name}
_ -> nil
end)
case test_key do
nil ->
# No running test found for this module
Logger.error(
"Attempted to report LLM call for module #{inspect(module)}, but no test is currently running for it."
)
{:noreply, state}
test_key ->
# Found the running test, append the LLM call
llm_call_data = %{request: request, response: response, timestamp: DateTime.utc_now()}
new_state = update_in(state, [:tests, test_key, :llm_calls], fn calls -> [llm_call_data | calls] end)
{:noreply, new_state}
end
end
# catch all other messages
def handle_cast(_, state), do: {:noreply, state}
# Private functions
defp save_html_report(state) do
# Create reports directories if they don't exist
base_reports_dir = "reports"
llm_calls_dir = Path.join(base_reports_dir, "llm_calls")
File.mkdir_p!(base_reports_dir)
File.mkdir_p!(llm_calls_dir)
# Generate individual LLM call reports and collect links
tests_with_links =
Map.new(state.tests, fn {{module, name} = test_key, test_data} ->
if Enum.empty?(test_data.llm_calls) do
{test_key, test_data}
else
# Generate a unique, filesystem-safe filename
sanitized_module = module |> Module.split() |> Enum.join("_")
sanitized_name = name |> Atom.to_string() |> String.replace(~r/[^\w\-]+/, "_")
report_filename = "test_#{sanitized_module}_#{sanitized_name}.html"
report_filepath = Path.join(llm_calls_dir, report_filename)
# Link relative to main report
relative_link = Path.join("llm_calls", report_filename)
# Generate and save individual report
llm_html = Postline.TestFormatter.format_llm_calls_html(module, name, test_data.llm_calls)
File.write!(report_filepath, llm_html)
# Add link to test data for the main formatter
updated_test_data = Map.put(test_data, :llm_report_link, relative_link)
{test_key, updated_test_data}
end
end)
# Prepare state with updated test data (including links)
updated_state = %{state | tests: tests_with_links}
# Generate main report filename
main_report_filename = Path.join(base_reports_dir, "test_report.html")
# Generate main HTML content using updated state
main_html_content = Postline.TestFormatter.format_html(updated_state)
# Write main report to file
File.write!(main_report_filename, main_html_content)
IO.puts("Test report saved to file://#{Path.absname(main_report_filename)}")
# Note: Individual LLM reports are in #{llm_calls_dir}
end
end