A custom ExUnit test reporter that saves an HTML report of the LLM calls made during integration tests.
defmodule Postline.IntegrationCase do
  @moduledoc """
  This module defines the setup for tests requiring
  access to real external services like OpenAI.

  Integration tests are meant to test actual integration with
  third-party services and are not run by default with `mix test`.
  Use `mix test.integration` to run integration tests.

  These tests can incur costs as they make real API calls!
  """

  use ExUnit.CaseTemplate

  alias Ecto.Adapters.SQL.Sandbox

  using do
    quote do
      # Import all the same test helpers as DataCase
      # ...
      import ExUnit.CaptureLog
      import Postline.IntegrationCase

      # Tag all integration tests
      @moduletag :integration

      def chat_completion(prompt_module, opts) do
        Postline.IntegrationCase.chat_completion(__MODULE__, prompt_module, opts)
      end
    end
  end
  setup tags do
    # Refuse to run integration tests unless INTEGRATION_TEST=true
    # (DataCase has the reverse condition)
    if Application.get_env(:postline, Postline.IntegrationCase)[:enabled] != true do
      raise "Integration tests can't be run unless INTEGRATION_TEST=true"
    end

    Postline.IntegrationCase.setup_sandbox(tags)
    :ok
  end

  @doc """
  Sets up the sandbox based on the test tags.
  """
  def setup_sandbox(tags) do
    pid = Sandbox.start_owner!(Postline.Repo, shared: not tags[:async])
    on_exit(fn -> Sandbox.stop_owner(pid) end)
  end
  def llm_judge(test_module, question, {:ok, result}), do: llm_judge(test_module, question, result)

  def llm_judge(test_module, question, %{"choices" => [%{"message" => msg}]}),
    do: llm_judge(test_module, question, Jason.encode!(msg, pretty: true))

  def llm_judge(test_module, question, output) do
    inputs = %{question: question, output: output}

    with {:ok, %{"choices" => [%{"message" => %{"content" => content}}]}} <-
           chat_completion(test_module, Postline.Prompts.Library.LLMJudge, inputs: inputs),
         {:ok, %{"pass" => pass, "reason" => reason, "score" => score}} <- Jason.decode(content) do
      %{pass: pass, reason: reason, score: score}
    else
      error -> raise "Invalid return value from LLM judge: #{inspect(error)}"
    end
  end
  @doc """
  In Postline, prompts are module-based.
  Each such prompt has a `format` function, which takes inputs and some optional
  configuration and returns a request to be sent to the LLM.

  We're using OpenRouter under the hood. This is something you'll want to modify.
  """
  def chat_completion(test_module, prompt_module, opts) do
    # Put your default templating inputs here
    default_inputs = %{}
    opts = Keyword.update(opts, :inputs, default_inputs, &Map.merge(default_inputs, &1))

    # Format the prompt, send it, and record the call for the HTML report
    with {:ok, request} <- prompt_module.format(opts),
         {:ok, response} <- Postline.OpenRouter.chat_completion(request) do
      Postline.TestReporter.report_llm_call(test_module, request, response)
      {:ok, response}
    end
  end
  @doc """
  Asserts that an LLM response passes a specific evaluation criterion.

  ## Examples

      # Simple assertion
      assert_llm "Is it polite?", result

      # With minimum score requirement
      assert_llm "Is it polite?", result, min_score: 0.8

      # With custom error message
      assert_llm "Is it polite?", result, message: "Response must be polite"

  ## Options

    * `:min_score` - Minimum score required to pass (default: 0.5)
    * `:message` - Custom error message to display on failure
  """
  defmacro assert_llm(question, output, opts \\ []) do
    min_score = Keyword.get(opts, :min_score, 0.5)
    custom_message = Keyword.get(opts, :message, nil)

    quote do
      result = llm_judge(__MODULE__, unquote(question), unquote(output))
      question = unquote(question)
      min_score = unquote(min_score)
      custom_message = unquote(custom_message)

      if !(result.pass && result.score >= min_score) do
        message = """
        LLM response failed evaluation:
        Question: #{question}
        Output: #{inspect(unquote(output))}
        Pass: #{result.pass}
        Score: #{result.score} (minimum: #{min_score})
        Reason: #{result.reason}
        """

        message = if custom_message, do: message <> "\n\n#{inspect(custom_message)}", else: message
        flunk(message)
      end

      # Return the result for potential further assertions
      result
    end
  end
end
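To make the pieces concrete, here is a minimal sketch of an integration test built on this case template. `Postline.Prompts.Library.Summarizer` and its inputs are hypothetical stand-ins; `chat_completion/2`, `assert_llm/3`, and the `:integration` tag come from the module above.

defmodule Postline.SummarizerIntegrationTest do
  # Brings in the :integration moduletag, chat_completion/2 and assert_llm/3
  use Postline.IntegrationCase, async: true

  test "summaries stay short and polite" do
    # Summarizer is a hypothetical prompt module following the same contract as LLMJudge
    {:ok, response} =
      chat_completion(Postline.Prompts.Library.Summarizer, inputs: %{text: "Long article..."})

    # Judged via the LLMJudge prompt below; flunks with a detailed message under min_score
    assert_llm "Is the summary polite and at most three sentences?", response, min_score: 0.8
  end
end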
defmodule Postline.Prompts.Library.LLMJudge do
  @moduledoc false

  # Import the SigilLiquid module to use the ~SYSTEM, ~USER, ~LIQUID sigils
  use Postline.Prompts.Prompt

  @response_format %{
    "type" => "json_schema",
    "json_schema" => %{
      "name" => "results",
      "schema" => %{
        "type" => "object",
        "required" => ["reason", "score", "pass"],
        "properties" => %{
          "reason" => %{
            "type" => "string",
            "description" => "Analysis of the rubric and the output"
          },
          "score" => %{
            "type" => "number",
            "description" => "Score between 0.0 and 1.0"
          },
          "pass" => %{
            "type" => "boolean",
            "description" => "Whether the output passes the evaluation"
          }
        }
      }
    }
  }

  @doc """
  Returns the parameters for the prompt.

  Valid parameters are:
    - model: The model to use for the prompt.
    - temperature: The temperature to use for the prompt.
    - top_p: The top_p to use for the prompt.
    - top_k: The top_k to use for the prompt.
    - max_tokens: The max_tokens to use for the prompt.

  For the full list of parameters, see the OpenRouter API documentation:
  https://openrouter.ai/docs/api-reference/parameters
  """
  def params do
    %{model: "google/gemini-2.0-flash-001", temperature: 0.0, response_format: @response_format}
  end
  @doc """
  Returns the messages for the prompt.
  """
  def messages do
    [
      ~SYSTEM"""
      You are an expert judge tasked with evaluating the quality of LLM-generated outputs against specific criteria.
      Your job is to analyze the provided output and determine if it meets the requirements specified in the rubric. You must be fair, consistent, and thorough in your evaluation.

      For each evaluation, you will:
      1. Carefully analyze the output against the provided rubric
      2. Provide a detailed explanation of your reasoning in the "reason" field
      3. Assign a score between 0.0 (completely fails) and 1.0 (perfectly meets criteria)
      4. Make a clear pass/fail determination

      Below you will find:
      1. The rubric with criteria for evaluation
      2. The output that needs to be evaluated

      RUBRIC:
      {{ question }}

      OUTPUT TO EVALUATE:
      ```
      {{ output }}
      ```

      Your response must be structured as a JSON object with the following fields:
      - "reason": A detailed analysis explaining how the output meets or fails to meet the criteria
      - "score": A number between 0.0 and 1.0 representing the quality of the output
      - "pass": A boolean (true/false) indicating whether the output passes the evaluation

      Be objective and focus solely on the criteria provided in the rubric. Do not introduce your own criteria or biases.
      """
    ]
  end

  @doc """
  Returns the tools for the prompt.
  """
  def tools do
    []
  end
end
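Given the schema above, the judge's reply content is a JSON object. For illustration, this is what a typical reply looks like once decoded (values made up):

# Illustrative only: a typical judge reply after Jason.decode/1
{:ok, judged} =
  Jason.decode(~s({"reason": "Polite greeting, no imperatives.", "score": 0.9, "pass": true}))

judged
#=> %{"pass" => true, "reason" => "Polite greeting, no imperatives.", "score" => 0.9}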
defmodule Postline.TestFormatter do
  @moduledoc """
  Formats test stats as HTML for static report generation.
  """

  @doc """
  Converts test state into formatted HTML.
  """
  def format_html(state) do
    # Calculate summary statistics
    total_tests = Enum.count(state.tests)

    completed_tests =
      Enum.count(state.tests, fn {_key, test_data} ->
        test_data.started && test_data.finished && test_data.error == nil
      end)

    partial_tests =
      Enum.count(state.tests, fn {_key, test_data} ->
        test_data.started && test_data.finished && test_data.error == nil && test_data.used_rescue
      end)

    failed_tests =
      Enum.count(state.tests, fn {_key, test_data} ->
        test_data.error != nil
      end)

    # All completed tests are passed (including partials)
    passed_tests = completed_tests
    # Fully passed tests are those without rescue
    _fully_passed_tests = passed_tests - partial_tests

    # Calculate duration statistics
    durations =
      Enum.flat_map(state.tests, fn {_key, test_data} ->
        case test_data[:duration_ms] do
          nil -> []
          duration -> [duration]
        end
      end)

    avg_duration =
      if Enum.empty?(durations) do
        0
      else
        round(Enum.sum(durations) / length(durations))
      end

    max_duration = if Enum.empty?(durations), do: 0, else: Enum.max(durations)

    passed_percentage = if total_tests > 0, do: Float.round(passed_tests / total_tests * 100, 1), else: 0.0
    partial_percentage = if total_tests > 0, do: Float.round(partial_tests / total_tests * 100, 1), else: 0.0
    failed_percentage = if total_tests > 0, do: Float.round(failed_tests / total_tests * 100, 1), else: 0.0

    """
    <!DOCTYPE html>
    <html>
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>Postline Test Report</title>
      <style>
        body {
          font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
          line-height: 1.6;
          color: #333;
          max-width: 1200px;
          margin: 0 auto;
          padding: 20px;
          background-color: #f8f9fa;
        }
        h1, h2, h3 {
          color: #2d3748;
        }
        .header {
          margin-bottom: 30px;
        }
        .timestamp {
          color: #718096;
          font-size: 14px;
          margin-top: 5px;
          margin-bottom: 4px;
        }
        .duration-stats {
          color: #718096;
          font-size: 14px;
          margin-bottom: 2px;
        }
        .duration-stats .value {
          font-weight: 600;
          color: #4a5568;
        }
        .stats-container {
          display: grid;
          grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
          gap: 20px;
          margin: 25px 0;
        }
        .stat-card {
          background-color: white;
          border-radius: 8px;
          padding: 15px 20px;
          box-shadow: 0 1px 3px rgba(0,0,0,0.1);
          cursor: pointer;
          position: relative;
          transition: transform 0.1s ease-in-out;
        }
        .stat-card:hover {
          transform: translateY(-2px);
          box-shadow: 0 2px 5px rgba(0,0,0,0.15);
        }
        .stat-card.active {
          box-shadow: 0 0 0 2px #4299e1;
        }
        .stat-card .filter-text {
          position: absolute;
          top: 5px;
          right: 10px;
          font-size: 11px;
          color: #718096;
          font-weight: 500;
          display: none;
        }
        .stat-card.active .filter-text {
          display: block;
          color: #4299e1;
        }
        .stat-card.passed {
          border-left: 5px solid #38a169;
        }
        .stat-card.partial {
          border-left: 5px solid #d69e2e;
        }
        .stat-card.failed {
          border-left: 5px solid #e53e3e;
        }
        .stat-title {
          font-size: 14px;
          color: #718096;
          margin-bottom: 5px;
        }
        .stat-value {
          font-size: 22px;
          font-weight: 600;
          margin-bottom: 3px;
        }
        .stat-percentage {
          font-size: 14px;
          color: #718096;
        }
        table {
          border-collapse: collapse;
          width: 100%;
          margin: 20px 0;
          background-color: white;
          border-radius: 8px;
          overflow: hidden;
          box-shadow: 0 1px 3px rgba(0,0,0,0.1);
          table-layout: fixed;
        }
        th, td {
          padding: 12px 16px;
          border-bottom: 1px solid #e2e8f0;
          vertical-align: middle;
          white-space: nowrap;
        }
        th {
          background-color: #f7fafc;
          font-weight: 600;
          color: #4a5568;
        }
        /* Column sizing */
        th:first-child, td:first-child {
          width: 70%;
          text-align: left;
          white-space: normal;
          overflow: hidden;
          text-overflow: ellipsis;
        }
        th:nth-child(2), td:nth-child(2) {
          width: 10%;
          text-align: right;
        }
        th:nth-child(3), td:nth-child(3) {
          width: 10%;
          text-align: center;
        }
        th:nth-child(4), td:nth-child(4) {
          width: 10%;
          text-align: right;
        }
        tr:last-child td {
          border-bottom: none;
        }
        tr:hover {
          background-color: #f7fafc;
        }
        .badge {
          display: inline-block;
          padding: 4px 8px;
          border-radius: 4px;
          font-size: 12px;
          font-weight: 600;
        }
        .badge-success {
          background-color: #c6f6d5;
          color: #22543d;
        }
        .badge-partial {
          background-color: #fefcbf;
          color: #744210;
        }
        .badge-pending {
          background-color: #e9d8fd;
          color: #553c9a;
        }
        .badge-error {
          background-color: #fed7d7;
          color: #9b2c2c;
        }
        .badge-duration-fast {
          background-color: #c6f6d5;
          color: #22543d;
        }
        .badge-duration-medium {
          background-color: #fefcbf;
          color: #744210;
        }
        .badge-duration-slow {
          background-color: #fed7d7;
          color: #9b2c2c;
        }
        .badge-duration-very-slow {
          background-color: #2d3748;
          color: #ffffff;
        }
        a {
          color: #4299e1;
          text-decoration: none;
        }
        a:hover {
          text-decoration: underline;
        }
        .filter-controls {
          display: flex;
          justify-content: flex-end;
          margin-bottom: 10px;
          align-items: center;
        }
        .filter-controls label {
          margin-right: 8px;
          font-size: 14px;
          color: #4a5568;
        }
        .filter-controls button {
          background-color: #edf2f7;
          border: none;
          padding: 6px 12px;
          margin-left: 5px;
          border-radius: 4px;
          font-size: 12px;
          font-weight: 500;
          color: #4a5568;
          cursor: pointer;
          transition: all 0.2s;
        }
        .filter-controls button:hover {
          background-color: #e2e8f0;
        }
        .filter-controls button.active {
          background-color: #4299e1;
          color: white;
        }
        tr.hide {
          display: none;
        }
        .text-right {
          text-align: right;
        }
        .text-center {
          text-align: center;
        }
      </style>
    </head>
    <body>
      <div class="header">
        <h1>Postline Test Report</h1>
        <div class="duration-stats">Time: <span class="value">#{format_berlin_time()}</span></div>
        <div class="duration-stats">Average: <span class="value">#{avg_duration}ms</span></div>
        <div class="duration-stats">Longest: <span class="value">#{max_duration}ms</span></div>
      </div>
      <div class="stats-container">
        <div class="stat-card passed" data-filter="success" onclick="filterTests('success')">
          <span class="filter-text">Show Only</span>
          <div class="stat-title">Passed</div>
          <div class="stat-value">#{passed_tests}/#{total_tests}</div>
          <div class="stat-percentage">#{passed_percentage}%</div>
        </div>
        <div class="stat-card partial" data-filter="partial" onclick="filterTests('partial')">
          <span class="filter-text">Show Only</span>
          <div class="stat-title">Partial</div>
          <div class="stat-value">#{partial_tests}/#{total_tests}</div>
          <div class="stat-percentage">#{partial_percentage}%</div>
        </div>
        <div class="stat-card failed" data-filter="error" onclick="filterTests('error')">
          <span class="filter-text">Show Only</span>
          <div class="stat-title">Failed</div>
          <div class="stat-value">#{failed_tests}/#{total_tests}</div>
          <div class="stat-percentage">#{failed_percentage}%</div>
        </div>
      </div>
      <div class="filter-controls">
        <label>Filter:</label>
        <button id="filter-all" class="active" onclick="filterTests('all')">All Tests</button>
        <button id="filter-success" onclick="filterTests('success')">Passed</button>
        <button id="filter-partial" onclick="filterTests('partial')">Partial</button>
        <button id="filter-error" onclick="filterTests('error')">Failed</button>
      </div>
      <table>
        <thead>
          <tr>
            <th>Test</th>
            <th>Status</th>
            <th class="text-center">Duration</th>
            <th>View</th>
          </tr>
        </thead>
        <tbody>
          #{render_tests(state.tests)}
        </tbody>
      </table>
      <script>
        function filterTests(filterType) {
          // Reset all filters
          document.querySelectorAll('.filter-controls button').forEach(btn => {
            btn.classList.remove('active');
          });
          document.querySelectorAll('.stat-card').forEach(card => {
            card.classList.remove('active');
          });
          // Set the active button
          document.getElementById('filter-' + filterType).classList.add('active');
          // If it's not 'all', set the active card
          if (filterType !== 'all') {
            document.querySelector('.stat-card[data-filter="' + filterType + '"]').classList.add('active');
          }
          // Filter the table rows
          const rows = document.querySelectorAll('tbody tr');
          rows.forEach(row => {
            const statusBadge = row.querySelector('.badge');
            if (!statusBadge) return; // Skip error rows
            if (filterType === 'all') {
              row.classList.remove('hide');
            } else {
              // Check if the badge matches the filter
              const hasClass = statusBadge.classList.contains('badge-' + filterType);
              if (hasClass) {
                row.classList.remove('hide');
              } else {
                row.classList.add('hide');
              }
            }
          });
        }
        // Initialize with all tests showing
        window.onload = function() {
          filterTests('all');
        };
      </script>
    </body>
    </html>
    """
  end
  defp render_tests(tests) do
    Enum.map_join(tests, "", fn {{_module, name}, test_data} ->
      # Remove "test " from the beginning of the test name,
      # first ensuring the name is a string before replacing
      display_name =
        name
        |> to_string()
        |> String.replace(~r/^test /, "")

      status_badge =
        case test_data do
          %{error: error} when error != nil ->
            ~s(<span class="badge badge-error">Error</span>)

          %{started: true, finished: true, used_rescue: true} ->
            ~s(<span class="badge badge-partial">Partial</span>)

          %{started: true, finished: true} ->
            ~s(<span class="badge badge-success">Passed</span>)

          %{started: true, finished: false} ->
            ~s(<span class="badge badge-pending">Running</span>)
        end

      duration_badge = create_duration_badge(test_data[:duration_ms])

      llm_link =
        case test_data[:llm_report_link] do
          nil ->
            ""

          link ->
            llm_count = Enum.count(test_data.llm_calls)
            ~s(<a href="#{link}">View \(#{llm_count}\)</a>)
        end

      error_row =
        case test_data.error do
          nil ->
            ""

          error ->
            """
            <tr>
              <td colspan="4" class="badge-error" style="white-space: pre-wrap;">#{error}</td>
            </tr>
            """
        end

      """
      <tr>
        <td>#{display_name}</td>
        <td>#{status_badge}</td>
        <td class="text-center">#{duration_badge}</td>
        <td>#{llm_link}</td>
      </tr>
      #{error_row}
      """
    end)
  end

  defp create_duration_badge(nil), do: "N/A"

  defp create_duration_badge(duration_ms) do
    {badge_class, badge_text} =
      cond do
        duration_ms <= 3000 -> {"badge-duration-fast", "#{duration_ms}ms"}
        duration_ms <= 5000 -> {"badge-duration-medium", "#{duration_ms}ms"}
        duration_ms <= 8000 -> {"badge-duration-slow", "#{duration_ms}ms"}
        true -> {"badge-duration-very-slow", "#{duration_ms}ms"}
      end

    ~s(<span class="badge #{badge_class}">#{badge_text}</span>)
  end

  defp format_berlin_time do
    # Get current UTC time
    utc_now = DateTime.utc_now()

    # Determine if we're in DST (roughly; the actual calculation is more complex).
    # This is a simple approximation for Berlin time.
    month = utc_now.month

    dst_offset =
      if month in 3..10 do
        # CEST (+2 hours)
        2
      else
        # CET (+1 hour)
        1
      end

    # Add the Berlin offset to UTC
    berlin_time = DateTime.add(utc_now, dst_offset, :hour)

    # Format as HH:MM
    Calendar.strftime(berlin_time, "%H:%M")
  end
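  # If the project pulls in a real time zone database (e.g. the :tzdata package,
  # configured with `config :elixir, :time_zone_database, Tzdata.TimeZoneDatabase`),
  # the DST approximation above can be replaced with a proper zone shift; a sketch:
  #
  #   defp format_berlin_time do
  #     {:ok, berlin} = DateTime.shift_zone(DateTime.utc_now(), "Europe/Berlin")
  #     Calendar.strftime(berlin, "%H:%M")
  #   end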
  @doc """
  Generates HTML content for a single test's LLM calls.
  """
  def format_llm_calls_html(module, name, llm_calls) do
    # Reverse calls to show oldest first
    calls_content =
      llm_calls
      |> Enum.reverse()
      |> Enum.with_index(1)
      |> Enum.map_join(&render_single_llm_call/1)

    """
    <!DOCTYPE html>
    <html>
    <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>LLM Calls - #{inspect(module)} - #{inspect(name)}</title>
      <style>
        body {
          font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
          line-height: 1.6;
          color: #333;
          max-width: 1200px;
          margin: 0 auto;
          padding: 20px;
        }
        h1, h2 {
          color: #2d3748;
        }
        .llm-call {
          border: 1px solid #e2e8f0;
          border-radius: 8px;
          margin-bottom: 20px;
          background-color: #fff;
        }
        .llm-call-header {
          background-color: #edf2f7;
          padding: 10px 15px;
          font-weight: 600;
          border-bottom: 1px solid #e2e8f0;
          border-top-left-radius: 8px;
          border-top-right-radius: 8px;
          display: flex; /* Use flex for alignment */
          justify-content: space-between; /* Space out content */
          align-items: center; /* Center items vertically */
        }
        .llm-call-body {
          padding: 0 15px 15px 15px;
        }
        .llm-request-section {
          padding-top: 15px;
          border-top: 1px dashed #e2e8f0;
          margin-top: 15px;
        }
        /* Requests start hidden; the toggle button adds/removes this class via JS */
        .llm-request-section.hidden {
          display: none;
        }
        .message {
          border: 1px solid #cbd5e0;
          border-radius: 6px;
          margin-bottom: 10px;
          background-color: #f7fafc; /* Background for messages */
        }
        .message-header {
          background-color: #e2e8f0;
          padding: 5px 10px;
          font-weight: 500;
          text-transform: capitalize;
          border-bottom: 1px solid #cbd5e0;
          border-top-left-radius: 6px;
          border-top-right-radius: 6px;
        }
        .message-content {
          padding: 10px;
          white-space: pre-wrap; /* Preserve whitespace and wrap */
          word-wrap: break-word;
        }
        .reasoning-content {
          padding: 10px;
          white-space: pre-wrap;
          word-wrap: break-word;
          font-size: 0.85em; /* Smaller font for reasoning */
          color: #4a5568; /* Slightly darker text for reasoning */
          background-color: #f0f4f8; /* Slightly different background */
          border-bottom: 1px solid #e2e8f0;
        }
        .tool-call {
          border: 1px dashed #a0aec0;
          border-radius: 4px;
          padding: 10px;
          margin: 10px;
          background-color: #f0f4f8;
        }
        pre {
          background-color: #2d3748;
          color: #f7fafc;
          padding: 15px;
          border-radius: 6px;
          overflow-x: auto;
        }
        code {
          font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
        }
        .toggle-button {
          background-color: #4a5568;
          color: white;
          border: none;
          padding: 5px 10px;
          border-radius: 4px;
          cursor: pointer;
          font-size: 12px;
        }
        .toggle-button:hover {
          background-color: #2d3748;
        }
      </style>
    </head>
    <body>
      <h1>LLM Calls</h1>
      <h2>#{module}</h2>
      <h3>#{name}</h3>
      <a href="../test_report.html">← Back to Main Report</a>
      #{calls_content}
      <script>
        document.querySelectorAll('.toggle-button').forEach(button => {
          button.addEventListener('click', function() {
            const callIndex = this.getAttribute('data-call-index');
            const requestSection = document.getElementById(`request-${callIndex}`);
            if (requestSection) {
              requestSection.classList.toggle('hidden');
              // Change button text
              if (requestSection.classList.contains('hidden')) {
                this.textContent = 'Show Request';
              } else {
                this.textContent = 'Hide Request';
              }
            }
          });
        });
      </script>
    </body>
    </html>
    """
  end
  defp render_single_llm_call({llm_call, index}) do
    request_content = format_request(llm_call.request)
    response_content = format_response(llm_call.response)
    timestamp_str = llm_call.timestamp |> DateTime.truncate(:second) |> DateTime.to_string()
    request_section_id = "request-#{index}"

    """
    <div class="llm-call">
      <div class="llm-call-header">
        <span>LLM Call ##{index} (#{timestamp_str})</span>
        <button class="toggle-button" data-call-index="#{index}">Show Request</button>
      </div>
      <div class="llm-call-body">
        <div id="#{request_section_id}" class="llm-request-section hidden">
          <h3>Request</h3>
          #{request_content}
        </div>
        <div>
          <h3>Response</h3>
          #{response_content}
        </div>
      </div>
    </div>
    """
  end

  defp format_request(request) do
    # Assuming the request has a :messages key, similar to OpenAI's API
    messages = request[:messages] || request["messages"] || []
    Enum.map_join(messages, &format_message/1)
  end

  defp format_response(response) do
    # Assuming a response format similar to OpenAI's ChatCompletion
    choices = response[:choices] || response["choices"] || []

    Enum.map_join(choices, fn choice ->
      message = choice[:message] || choice["message"]
      finish_reason = choice[:finish_reason] || choice["finish_reason"]
      message_html = format_message(message)

      """
      #{message_html}
      <p><em>Finish Reason: #{finish_reason || "N/A"}</em></p>
      """
    end)
  end
  defp format_message(message) when is_map(message) do
    # Note: this assumes string keys, as produced by the JSON-decoded API response
    role = Map.get(message, "role")
    content = Map.get(message, "content")
    reasoning = Map.get(message, "reasoning")
    tool_calls = Map.get(message, "tool_calls")

    content_html = ""

    # Add reasoning with different styling if present
    content_html =
      if is_binary(reasoning) and String.trim(reasoning) != "" do
        content_html <> ~s(<div class="reasoning-content">#{Plug.HTML.html_escape(reasoning)}</div>)
      else
        content_html
      end

    # Add content with regular styling if present
    content_html =
      if is_binary(content) and String.trim(content) != "" do
        content_html <> ~s(<div class="message-content">#{Plug.HTML.html_escape(content)}</div>)
      else
        content_html <> ~s(<div class="message-content"><em>No text content</em></div>)
      end

    content_html =
      if is_list(tool_calls) and not Enum.empty?(tool_calls) do
        content_html <> format_tool_calls(tool_calls)
      else
        content_html
      end

    """
    <div class="message">
      <div class="message-header">Role: #{role}</div>
      #{content_html}
    </div>
    """
  end

  defp format_message(_), do: "<p><em>Invalid message format</em></p>"

  defp format_tool_calls(tool_calls) when is_list(tool_calls) do
    Enum.map_join(tool_calls, fn tool_call ->
      # These also assume string keys from the decoded JSON response
      id = Map.get(tool_call, "id")
      type = Map.get(tool_call, "type")
      function_map = Map.get(tool_call, "function")
      name = Map.get(function_map, "name")
      arguments = Map.get(function_map, "arguments")

      # Attempt to pretty-print arguments if they look like JSON
      formatted_args =
        case Jason.decode(arguments) do
          {:ok, decoded_args} ->
            decoded_args
            |> Jason.encode!(pretty: true)
            |> Plug.HTML.html_escape()

          _ ->
            Plug.HTML.html_escape(arguments)
        end

      """
      <div class="tool-call">
        <strong>Tool Call ID:</strong> #{id}<br>
        <strong>Type:</strong> #{type}<br>
        <strong>Function Name:</strong> #{name}<br>
        <strong>Arguments:</strong>
        <pre><code>#{formatted_args}</code></pre>
      </div>
      """
    end)
  end

  defp format_tool_calls(_), do: "<em>Invalid tool calls format</em>"
end
# test_helper.exs
integration_test_config = Application.get_env(:postline, Postline.IntegrationCase)

# Attach the custom reporter alongside the default CLI formatter,
# but only when integration tests are enabled
opts =
  if integration_test_config[:enabled],
    do: [formatters: [ExUnit.CLIFormatter, Postline.TestReporter]],
    else: []

ExUnit.start(opts)
Ecto.Adapters.SQL.Sandbox.mode(Postline.Repo, :manual)
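The gist doesn't show how the `:enabled` flag gets set or how `mix test.integration` is defined. A plausible wiring, where the config file layout and the alias are assumptions while the key names match the code above:

# config/runtime.exs (assumption): evaluated at boot, including under `mix test`
config :postline, Postline.IntegrationCase,
  enabled: System.get_env("INTEGRATION_TEST") == "true"

# mix.exs (assumption): run as `INTEGRATION_TEST=true mix test.integration`
defp aliases do
  ["test.integration": "test --only integration"]
end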
defmodule Postline.TestReporter do
  @moduledoc false

  use GenServer

  require Logger

  # Client API

  # Note: when used as an ExUnit formatter this process is started by ExUnit
  # itself, so the name is registered in init/1 rather than passed here
  # (registering it in both places would crash with "already registered").
  def start_link(opts \\ []) do
    GenServer.start_link(__MODULE__, opts)
  end

  @doc """
  Reports an LLM generation call made during a test.
  Determines the current test based on the provided module.
  Requires the module, the request, and the response.
  """
  def report_llm_call(module, request, response) do
    GenServer.cast(__MODULE__, {:llm_generation, module, request, response})
  end

  @doc """
  Marks the current test as having used a rescue block. This will be used
  in the test reporter to show the test as "partial" rather than "passed".
  """
  def mark_test_rescue_used(module) do
    GenServer.cast(__MODULE__, {:mark_rescue_used, module})
  end

  # Server Callbacks

  @impl true
  def init(_opts) do
    state = %{
      # Structure: {module, name} => %{started: bool, finished: bool, llm_calls: list, ...}
      tests: %{}
    }

    # Register a name so report_llm_call/3 and friends can reach this process
    Process.register(self(), __MODULE__)
    {:ok, state}
  end
  @impl true
  def handle_cast({:suite_finished, _times_used}, state) do
    # Generate HTML report and save it to a file
    save_html_report(state)
    {:noreply, state}
  end

  @impl true
  def handle_cast({event, %{state: {:excluded, _reason}}}, state) when event in [:test_started, :test_finished] do
    {:noreply, state}
  end

  @impl true
  def handle_cast({:test_started, test}, state) do
    test_key = {test.module, test.name}

    # Initialize test state with the current timestamp
    start_time = System.monotonic_time()

    new_state =
      put_in(state, [:tests, test_key], %{
        started: true,
        finished: false,
        error: nil,
        llm_calls: [],
        used_rescue: false,
        start_time: start_time
      })

    {:noreply, new_state}
  end

  @impl true
  def handle_cast({:test_finished, test}, state) do
    test_key = {test.module, test.name}
    end_time = System.monotonic_time()

    # Calculate duration in milliseconds
    start_time = get_in(state, [:tests, test_key, :start_time]) || end_time
    duration_ms = System.convert_time_unit(end_time - start_time, :native, :millisecond)

    # Print result to terminal
    test_name = "#{inspect(test.module)} #{test.name}"

    status =
      case test.state do
        {:failed, _} -> IO.ANSI.red() <> "FAILED" <> IO.ANSI.reset()
        _ -> IO.ANSI.green() <> "PASSED" <> IO.ANSI.reset()
      end

    # Passed tests have a nil state, so this only prints non-passing results,
    # avoiding interference with ExUnit's own output
    if test.state != nil do
      IO.puts("#{test_name} (#{duration_ms}ms): #{status}")
    end

    # Mark test as finished
    error =
      case test.state do
        {:failed, [{:error, error, stacktrace}]} ->
          # Link to the first stacktrace frame that carries a file/line location
          file_link =
            case stacktrace do
              [{_mod, _fun, _arity, location} | _] when is_list(location) ->
                file = location[:file]
                line = location[:line]

                if file && line do
                  "<a href=\"cursor://file/#{Path.absname(file)}:#{line}\">#{file}:#{line}</a>"
                else
                  ""
                end

              _ ->
                ""
            end

          message =
            case error do
              %ExUnit.AssertionError{left: left, right: right} when left != :ex_unit_no_meaningful_value ->
                "Expected to be equal:<br>Left: #{inspect(left)}<br>Right: #{inspect(right)}"

              %{message: message} when is_binary(message) ->
                message

              error ->
                inspect(error)
            end

          file_link <> "<br>" <> message

        {:failed, _} ->
          "Unknown error"

        _ ->
          nil
      end

    new_state =
      update_in(
        state,
        [:tests, test_key],
        &(&1
          |> Map.put(:finished, true)
          |> Map.put(:error, error)
          |> Map.put(:duration_ms, duration_ms))
      )

    {:noreply, new_state}
  end
  @impl true
  def handle_cast({:mark_rescue_used, module}, state) do
    case find_running_test(state, module) do
      nil ->
        # No running test found for this module
        Logger.error(
          "Attempted to mark rescue used for module #{inspect(module)}, but no test is currently running for it."
        )

        {:noreply, state}

      test_key ->
        # Found the running test, mark it as having used a rescue block
        {:noreply, put_in(state, [:tests, test_key, :used_rescue], true)}
    end
  end

  @impl true
  def handle_cast({:llm_generation, module, request, response}, state) do
    case find_running_test(state, module) do
      nil ->
        # No running test found for this module
        Logger.error(
          "Attempted to report LLM call for module #{inspect(module)}, but no test is currently running for it."
        )

        {:noreply, state}

      test_key ->
        # Found the running test, prepend the LLM call (stored newest first)
        llm_call_data = %{request: request, response: response, timestamp: DateTime.utc_now()}
        {:noreply, update_in(state, [:tests, test_key, :llm_calls], &[llm_call_data | &1])}
    end
  end

  # Catch all other formatter events (:suite_started, :module_finished, ...)
  @impl true
  def handle_cast(_, state), do: {:noreply, state}
  # Private functions

  # Finds the test that is currently running (started but not finished)
  # for the given module, returning its key or nil.
  defp find_running_test(state, module) do
    Enum.find_value(state.tests, fn
      {{^module, test_name}, %{started: true, finished: false}} -> {module, test_name}
      _ -> nil
    end)
  end

  defp save_html_report(state) do
    # Create reports directories if they don't exist
    base_reports_dir = "reports"
    llm_calls_dir = Path.join(base_reports_dir, "llm_calls")
    File.mkdir_p!(base_reports_dir)
    File.mkdir_p!(llm_calls_dir)

    # Generate individual LLM call reports and collect links
    tests_with_links =
      Map.new(state.tests, fn {{module, name} = test_key, test_data} ->
        if Enum.empty?(test_data.llm_calls) do
          {test_key, test_data}
        else
          # Generate a unique, filesystem-safe filename
          sanitized_module = module |> Module.split() |> Enum.join("_")
          sanitized_name = name |> Atom.to_string() |> String.replace(~r/[^\w\-]+/, "_")
          report_filename = "test_#{sanitized_module}_#{sanitized_name}.html"
          report_filepath = Path.join(llm_calls_dir, report_filename)

          # Link relative to the main report
          relative_link = Path.join("llm_calls", report_filename)

          # Generate and save the individual report
          llm_html = Postline.TestFormatter.format_llm_calls_html(module, name, test_data.llm_calls)
          File.write!(report_filepath, llm_html)

          # Add the link to the test data for the main formatter
          updated_test_data = Map.put(test_data, :llm_report_link, relative_link)
          {test_key, updated_test_data}
        end
      end)

    # Prepare state with updated test data (including links)
    updated_state = %{state | tests: tests_with_links}

    # Generate the main report using the updated state
    main_report_filename = Path.join(base_reports_dir, "test_report.html")
    main_html_content = Postline.TestFormatter.format_html(updated_state)
    File.write!(main_report_filename, main_html_content)

    IO.puts("Test report saved to file://#{Path.absname(main_report_filename)}")
    # Note: individual LLM reports live under the llm_calls directory
  end
end
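For completeness, a sketch of how `mark_test_rescue_used/1` is meant to be called from inside a test living in a module that uses `Postline.IntegrationCase`. The test body and prompt module are hypothetical; rescuing the assertion keeps the suite green while the HTML report flags the test as "Partial":

test "summary mentions the deadline (flaky, tolerated)" do
  {:ok, response} =
    chat_completion(Postline.Prompts.Library.Summarizer, inputs: %{text: "Long article..."})

  assert_llm "Does the summary mention the deadline?", response
rescue
  ExUnit.AssertionError ->
    # Swallow the failure, but flag it so the report shows "Partial" instead of "Passed"
    Postline.TestReporter.mark_test_rescue_used(__MODULE__)
end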