A custom ExUnit test reporter that saves an HTML report of the LLM calls made during integration tests
defmodule Postline.IntegrationCase do
@moduledoc """
This module defines the setup for tests requiring
access to real external services like OpenAI.
Integration tests are meant to test actual integration with
third-party services and are not run by default with `mix test`.
Use `mix test.integration` to run integration tests.
These tests can incur costs as they make real API calls!
"""
use ExUnit.CaseTemplate
alias Ecto.Adapters.SQL.Sandbox
using do
quote do
# Import all the same test helpers as DataCase
# ...
import ExUnit.CaptureLog
import Postline.IntegrationCase
# Tag all integration tests
@moduletag :integration
def chat_completion(prompt_module, opts) do
Postline.IntegrationCase.chat_completion(__MODULE__, prompt_module, opts)
end
end
end
setup tags do
# Refuse to run integration tests unless INTEGRATION_TEST=true
# (DataCase has the reverse condition)
if Application.get_env(:postline, Postline.IntegrationCase)[:enabled] != true do
raise "Integration tests can't run unless INTEGRATION_TEST=true"
end
Postline.IntegrationCase.setup_sandbox(tags)
:ok
end
@doc """
Sets up the sandbox based on the test tags.
"""
def setup_sandbox(tags) do
pid = Sandbox.start_owner!(Postline.Repo, shared: not tags[:async])
on_exit(fn -> Sandbox.stop_owner(pid) end)
end
def llm_judge(test_module, question, {:ok, result}), do: llm_judge(test_module, question, result)
def llm_judge(test_module, question, %{"choices" => [%{"message" => msg}]}),
do: llm_judge(test_module, question, Jason.encode!(msg, pretty: true))
def llm_judge(test_module, question, output) do
inputs = %{question: question, output: output}
with {:ok, %{"choices" => [%{"message" => %{"content" => content}}]}} <-
chat_completion(test_module, Postline.Prompts.Library.LLMJudge, inputs: inputs),
{:ok, %{"pass" => pass, "reason" => reason, "score" => score}} <- Jason.decode(content) do
%{pass: pass, reason: reason, score: score}
else
{:error, error} -> throw("Invalid return value from LLM judge: #{inspect(error)}")
error -> throw("Invalid return value from LLM judge: #{inspect(error)}")
end
end
@doc """
In Postline, prompts are module-based.
Each prompt module exposes a `format/1` function, which takes inputs and optional
configuration and returns a request to be sent to the LLM.
We're using OpenRouter under the hood; this is something you'll want to adapt to your own setup.
"""
def chat_completion(test_module, prompt_module, opts) do
# put your default templating inputs here
default_inputs = %{}
opts = Keyword.update(opts, :inputs, default_inputs, &Map.merge(default_inputs, &1))
# format prompt
with {:ok, request} <- prompt_module.format(opts),
{:ok, response} <- Postline.OpenRouter.chat_completion(request) do
Postline.TestReporter.report_llm_call(test_module, request, response)
{:ok, response}
end
end
@doc """
Asserts that an LLM response passes a specific evaluation criterion.
## Examples
# Simple assertion
assert_llm "Is it polite?", result
# With minimum score requirement
assert_llm "Is it polite?", result, min_score: 0.8
# With custom error message
assert_llm "Is it polite?", result, message: "Response must be polite"
## Options
* `:min_score` - Minimum score required to pass (default: 0.5)
* `:message` - Custom error message to display on failure
"""
defmacro assert_llm(question, output, opts \\ []) do
min_score = Keyword.get(opts, :min_score, 0.5)
custom_message = Keyword.get(opts, :message, nil)
quote do
question = unquote(question)
output = unquote(output)
min_score = unquote(min_score)
custom_message = unquote(custom_message)
# Evaluate the output expression only once, then run the LLM judge on it
result = llm_judge(__MODULE__, question, output)
if !(result.pass && result.score >= min_score) do
message = """
LLM response failed evaluation:
Question: #{question}
Output: #{inspect(output)}
Pass: #{result.pass}
Score: #{result.score} (minimum: #{min_score})
Reason: #{result.reason}
"""
message = if custom_message, do: message <> "\n\n#{inspect(custom_message)}", else: message
flunk(message)
end
# Return the result for potential further assertions
result
end
end
end
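# Example of how a test module built on this case template might look
# (a minimal sketch; `MyApp.Prompts.SupportReply` and its inputs are hypothetical
# placeholders, not part of this gist):
defmodule Postline.SupportReplyIntegrationTest do
  use Postline.IntegrationCase

  test "reply to an angry customer stays polite" do
    # Makes a real API call and reports it to the HTML report via TestReporter
    {:ok, response} =
      chat_completion(MyApp.Prompts.SupportReply, inputs: %{message: "This product is terrible!"})

    try do
      assert_llm "Is the reply polite and apologetic?", {:ok, response}, min_score: 0.8
    rescue
      ExUnit.AssertionError ->
        # Fall back to a weaker rubric, but flag the test as "partial" in the report
        Postline.TestReporter.mark_test_rescue_used(__MODULE__)
        assert_llm "Is the reply at least not rude?", {:ok, response}
    end
  end
end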
defmodule Postline.Prompts.Library.LLMJudge do
@moduledoc false
# Import the SigilLiquid module to use the ~SYSTEM, ~USER, ~LIQUID sigils
use Postline.Prompts.Prompt
@response_format %{
"type" => "json_schema",
"json_schema" => %{
"name" => "results",
"schema" => %{
"type" => "object",
"required" => ["reason", "score", "pass"],
"properties" => %{
"reason" => %{
"type" => "string",
"description" => "Analysis of the rubric and the output"
},
"score" => %{
"type" => "number",
"description" => "Score between 0.0 and 1.0"
},
"pass" => %{
"type" => "boolean",
"description" => "Whether the output passes the evaluation"
}
}
}
}
}
@doc """
Returns the parameters for the prompt.
Valid parameters are:
- model: The model to use for the prompt.
- temperature: The temperature to use for the prompt.
- top_p: The top_p to use for the prompt.
- top_k: The top_k to use for the prompt.
- max_tokens: The max_tokens to use for the prompt.
For the full list of parameters, see the OpenRouter API documentation:
https://openrouter.ai/docs/api-reference/parameters
"""
def params do
%{model: "google/gemini-2.0-flash-001", temperature: 0.0, response_format: @response_format}
end
@doc """
Returns the messages for the prompt.
"""
def messages do
[
~SYSTEM"""
You are an expert judge tasked with evaluating the quality of LLM-generated outputs against specific criteria.
Your job is to analyze the provided output and determine if it meets the requirements specified in the rubric. You must be fair, consistent, and thorough in your evaluation.
For each evaluation, you will:
1. Carefully analyze the output against the provided rubric
2. Provide a detailed explanation of your reasoning in the "reason" field
3. Assign a score between 0.0 (completely fails) and 1.0 (perfectly meets criteria)
4. Make a clear pass/fail determination
Below you will find:
1. The rubric with criteria for evaluation
2. The output that needs to be evaluated
RUBRIC:
{{ question }}
OUTPUT TO EVALUATE:
```
{{ output }}
```
Your response must be structured as a JSON object with the following fields:
- "reason": A detailed analysis explaining how the output meets or fails to meet the criteria
- "score": A number between 0.0 and 1.0 representing the quality of the output
- "pass": A boolean (true/false) indicating whether the output passes the evaluation
Be objective and focus solely on the criteria provided in the rubric. Do not introduce your own criteria or biases.
"""
]
end
@doc """
Returns the tools for the prompt.
"""
def tools do
[]
end
end
defmodule Postline.TestFormatter do
@moduledoc """
Formats test stats as HTML for static report generation.
"""
@doc """
Converts test state into formatted HTML.
"""
def format_html(state) do
# Calculate summary statistics
total_tests = Enum.count(state.tests)
completed_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.started && test_data.finished && test_data.error == nil
end)
partial_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.started && test_data.finished && test_data.error == nil && test_data.used_rescue
end)
failed_tests =
Enum.count(state.tests, fn {_key, test_data} ->
test_data.error != nil
end)
# All completed tests count as passed (partials included)
passed_tests = completed_tests
# Fully passed tests are those without rescue
_fully_passed_tests = passed_tests - partial_tests
# Calculate duration statistics
durations =
Enum.flat_map(state.tests, fn {_key, test_data} ->
case test_data[:duration_ms] do
nil -> []
duration -> [duration]
end
end)
avg_duration =
if Enum.empty?(durations) do
0
else
round(Enum.sum(durations) / length(durations))
end
max_duration = if Enum.empty?(durations), do: 0, else: Enum.max(durations)
passed_percentage = if total_tests > 0, do: Float.round(passed_tests / total_tests * 100, 1), else: 0.0
partial_percentage = if total_tests > 0, do: Float.round(partial_tests / total_tests * 100, 1), else: 0.0
failed_percentage = if total_tests > 0, do: Float.round(failed_tests / total_tests * 100, 1), else: 0.0
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Postline Test Report</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
h1, h2, h3 {
color: #2d3748;
}
.header {
margin-bottom: 30px;
}
.timestamp {
color: #718096;
font-size: 14px;
margin-top: 5px;
margin-bottom: 4px;
}
.duration-stats {
color: #718096;
font-size: 14px;
margin-bottom: 2px;
}
.duration-stats .value {
font-weight: 600;
color: #4a5568;
}
.stats-container {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin: 25px 0;
}
.stat-card {
background-color: white;
border-radius: 8px;
padding: 15px 20px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
cursor: pointer;
position: relative;
transition: transform 0.1s ease-in-out;
}
.stat-card:hover {
transform: translateY(-2px);
box-shadow: 0 2px 5px rgba(0,0,0,0.15);
}
.stat-card.active {
box-shadow: 0 0 0 2px #4299e1;
}
.stat-card .filter-text {
position: absolute;
top: 5px;
right: 10px;
font-size: 11px;
color: #718096;
font-weight: 500;
display: none;
}
.stat-card.active .filter-text {
display: block;
color: #4299e1;
}
.stat-card.passed {
border-left: 5px solid #38a169;
}
.stat-card.partial {
border-left: 5px solid #d69e2e;
}
.stat-card.failed {
border-left: 5px solid #e53e3e;
}
.stat-title {
font-size: 14px;
color: #718096;
margin-bottom: 5px;
}
.stat-value {
font-size: 22px;
font-weight: 600;
margin-bottom: 3px;
}
.stat-percentage {
font-size: 14px;
color: #718096;
}
table {
border-collapse: collapse;
width: 100%;
margin: 20px 0;
background-color: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
table-layout: fixed;
}
th, td {
padding: 12px 16px;
border-bottom: 1px solid #e2e8f0;
vertical-align: middle;
white-space: nowrap;
}
th {
background-color: #f7fafc;
font-weight: 600;
color: #4a5568;
}
/* Column sizing */
th:first-child, td:first-child {
width: 70%;
text-align: left;
white-space: normal;
overflow: hidden;
text-overflow: ellipsis;
}
th:nth-child(2), td:nth-child(2) {
width: 10%;
text-align: right;
}
th:nth-child(3), td:nth-child(3) {
width: 10%;
text-align: center;
}
th:nth-child(4), td:nth-child(4) {
width: 10%;
text-align: right;
}
tr:last-child td {
border-bottom: none;
}
tr:hover {
background-color: #f7fafc;
}
.badge {
display: inline-block;
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: 600;
}
.badge-success {
background-color: #c6f6d5;
color: #22543d;
}
.badge-partial {
background-color: #fefcbf;
color: #744210;
}
.badge-pending {
background-color: #e9d8fd;
color: #553c9a;
}
.badge-error {
background-color: #fed7d7;
color: #9b2c2c;
}
.badge-duration-fast {
background-color: #c6f6d5;
color: #22543d;
}
.badge-duration-medium {
background-color: #fefcbf;
color: #744210;
}
.badge-duration-slow {
background-color: #fed7d7;
color: #9b2c2c;
}
.badge-duration-very-slow {
background-color: #2d3748;
color: #ffffff;
}
a {
color: #4299e1;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
.filter-controls {
display: flex;
justify-content: flex-end;
margin-bottom: 10px;
align-items: center;
}
.filter-controls label {
margin-right: 8px;
font-size: 14px;
color: #4a5568;
}
.filter-controls button {
background-color: #edf2f7;
border: none;
padding: 6px 12px;
margin-left: 5px;
border-radius: 4px;
font-size: 12px;
font-weight: 500;
color: #4a5568;
cursor: pointer;
transition: all 0.2s;
}
.filter-controls button:hover {
background-color: #e2e8f0;
}
.filter-controls button.active {
background-color: #4299e1;
color: white;
}
tr.hide {
display: none;
}
.text-right {
text-align: right;
}
.text-center {
text-align: center;
}
</style>
</head>
<body>
<div class="header">
<h1>Postline Test Report</h1>
<div class="duration-stats">Time: <span class="value">#{format_berlin_time()}</span></div>
<div class="duration-stats">Average: <span class="value">#{avg_duration}ms</span></div>
<div class="duration-stats">Longest: <span class="value">#{max_duration}ms</span></div>
</div>
<div class="stats-container">
<div class="stat-card passed" data-filter="success" onclick="filterTests('success')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Passed</div>
<div class="stat-value">#{passed_tests}/#{total_tests}</div>
<div class="stat-percentage">#{passed_percentage}%</div>
</div>
<div class="stat-card partial" data-filter="partial" onclick="filterTests('partial')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Partial</div>
<div class="stat-value">#{partial_tests}/#{total_tests}</div>
<div class="stat-percentage">#{partial_percentage}%</div>
</div>
<div class="stat-card failed" data-filter="error" onclick="filterTests('error')">
<span class="filter-text">Show Only</span>
<div class="stat-title">Failed</div>
<div class="stat-value">#{failed_tests}/#{total_tests}</div>
<div class="stat-percentage">#{failed_percentage}%</div>
</div>
</div>
<div class="filter-controls">
<label>Filter:</label>
<button id="filter-all" class="active" onclick="filterTests('all')">All Tests</button>
<button id="filter-success" onclick="filterTests('success')">Passed</button>
<button id="filter-partial" onclick="filterTests('partial')">Partial</button>
<button id="filter-error" onclick="filterTests('error')">Failed</button>
</div>
<table>
<thead>
<tr>
<th>Test</th>
<th>Status</th>
<th class="text-center">Duration</th>
<th>View</th>
</tr>
</thead>
<tbody>
#{render_tests(state.tests)}
</tbody>
</table>
<script>
function filterTests(filterType) {
// Reset all filters
document.querySelectorAll('.filter-controls button').forEach(btn => {
btn.classList.remove('active');
});
document.querySelectorAll('.stat-card').forEach(card => {
card.classList.remove('active');
});
// Set the active button
document.getElementById('filter-' + filterType).classList.add('active');
// If it's not 'all', set the active card
if (filterType !== 'all') {
document.querySelector('.stat-card[data-filter="' + filterType + '"]').classList.add('active');
}
// Filter the table rows
const rows = document.querySelectorAll('tbody tr');
rows.forEach(row => {
const statusBadge = row.querySelector('.badge');
if (!statusBadge) return; // Skip error rows
if (filterType === 'all') {
row.classList.remove('hide');
} else {
// Check if the badge matches the filter
const hasClass = statusBadge.classList.contains('badge-' + filterType);
if (hasClass) {
row.classList.remove('hide');
} else {
row.classList.add('hide');
}
}
});
}
// Initialize with all tests showing
window.onload = function() {
filterTests('all');
};
</script>
</body>
</html>
"""
end
defp render_tests(tests) do
Enum.map_join(tests, "", fn {{_module, name}, test_data} ->
# Remove "test " from the beginning of the test name
# First, ensure name is a string before trying to replace
display_name =
name
|> to_string()
|> String.replace(~r/^test /, "")
status_badge =
case test_data do
%{error: error} when error != nil ->
~s(<span class="badge badge-error">Error</span>)
%{started: true, finished: true, used_rescue: true} ->
~s(<span class="badge badge-partial">Partial</span>)
%{started: true, finished: true} ->
~s(<span class="badge badge-success">Passed</span>)
%{started: true, finished: false} ->
~s(<span class="badge badge-pending">Running</span>)
end
duration_badge = create_duration_badge(test_data[:duration_ms])
llm_link =
case test_data[:llm_report_link] do
nil ->
""
link ->
llm_count = Enum.count(test_data.llm_calls)
~s(<a href="#{link}">View \(#{llm_count}\)</a>)
end
error_row =
case test_data.error do
nil ->
""
error ->
"""
<tr>
<td colspan="4" class="badge-error" style="white-space: pre-wrap;">#{error}</td>
</tr>
"""
end
"""
<tr>
<td>#{display_name}</td>
<td>#{status_badge}</td>
<td class="text-center">#{duration_badge}</td>
<td>#{llm_link}</td>
</tr>
#{error_row}
"""
end)
end
defp create_duration_badge(nil), do: "N/A"
defp create_duration_badge(duration_ms) do
{badge_class, badge_text} =
cond do
duration_ms <= 3000 -> {"badge-duration-fast", "#{duration_ms}ms"}
duration_ms <= 5000 -> {"badge-duration-medium", "#{duration_ms}ms"}
duration_ms <= 8000 -> {"badge-duration-slow", "#{duration_ms}ms"}
true -> {"badge-duration-very-slow", "#{duration_ms}ms"}
end
~s(<span class="badge #{badge_class}">#{badge_text}</span>)
end
defp format_berlin_time do
# Get current UTC time
utc_now = DateTime.utc_now()
# Determine if we're in DST (roughly - actual calculation is more complex)
# This is a simple approximation for Berlin time
month = utc_now.month
# CEST (+2 hours)
dst_offset =
if month in 3..10 do
2
else
# CET (+1 hour)
1
end
# Add the Berlin offset to UTC
berlin_time = DateTime.add(utc_now, dst_offset, :hour)
# Format as HH:MM
Calendar.strftime(berlin_time, "%H:%M")
end
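# A more precise alternative (a sketch, assuming the optional `tzdata` dependency is
# installed and configured as the time zone database) would shift into Europe/Berlin
# directly instead of approximating DST:
#
#   # config/config.exs
#   config :elixir, :time_zone_database, Tzdata.TimeZoneDatabase
#
#   DateTime.utc_now()
#   |> DateTime.shift_zone!("Europe/Berlin")
#   |> Calendar.strftime("%H:%M")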
@doc """
Generates HTML content for a single test's LLM calls.
"""
def format_llm_calls_html(module, name, llm_calls) do
# Reverse calls to show oldest first
calls_content =
llm_calls
|> Enum.reverse()
|> Enum.with_index(1)
|> Enum.map_join(&render_single_llm_call/1)
"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Calls - #{inspect(module)} - #{inspect(name)}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
h1, h2 {
color: #2d3748;
}
.llm-call {
border: 1px solid #e2e8f0;
border-radius: 8px;
margin-bottom: 20px;
background-color: #fff; /* Change background */
}
.llm-call-header {
background-color: #edf2f7;
padding: 10px 15px;
font-weight: 600;
border-bottom: 1px solid #e2e8f0;
border-top-left-radius: 8px;
border-top-right-radius: 8px;
display: flex; /* Use flex for alignment */
justify-content: space-between; /* Space out content */
align-items: center; /* Center items vertically */
}
.llm-call-body {
padding: 0 15px 15px 15px; /* Adjust padding */
}
.llm-request-section {
/* Hide request by default */
/* max-height: 0;
overflow: hidden; */
/* transition: max-height 0.3s ease-out; */
/* Let's keep it visible initially and make it collapsible */
padding-top: 15px;
border-top: 1px dashed #e2e8f0;
margin-top: 15px;
}
/* Style for when request is hidden - Add class via JS */
.llm-request-section.hidden {
display: none;
}
.message {
border: 1px solid #cbd5e0;
border-radius: 6px;
margin-bottom: 10px;
background-color: #f7fafc; /* Background for messages */
}
.message-header {
background-color: #e2e8f0;
padding: 5px 10px;
font-weight: 500;
text-transform: capitalize;
border-bottom: 1px solid #cbd5e0;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
}
.message-content {
padding: 10px;
white-space: pre-wrap; /* Preserve whitespace and wrap */
word-wrap: break-word;
}
.reasoning-content {
padding: 10px;
white-space: pre-wrap;
word-wrap: break-word;
font-size: 0.85em; /* Smaller font for reasoning */
color: #4a5568; /* Slightly darker text for reasoning */
background-color: #f0f4f8; /* Slightly different background */
border-bottom: 1px solid #e2e8f0;
}
.tool-call {
border: 1px dashed #a0aec0;
border-radius: 4px;
padding: 10px;
margin: 10px;
background-color: #f0f4f8;
}
pre {
background-color: #2d3748;
color: #f7fafc;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
}
code {
font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
}
.toggle-button {
background-color: #4a5568;
color: white;
border: none;
padding: 5px 10px;
border-radius: 4px;
cursor: pointer;
font-size: 12px;
}
.toggle-button:hover {
background-color: #2d3748;
}
</style>
</head>
<body>
<h1>LLM Calls</h1>
<h2>#{inspect(module)}</h2>
<h3>#{name}</h3>
<a href="../test_report.html">&larr; Back to Main Report</a>
#{calls_content}
<script>
document.querySelectorAll('.toggle-button').forEach(button => {
button.addEventListener('click', function() {
const callIndex = this.getAttribute('data-call-index');
const requestSection = document.getElementById(`request-${callIndex}`);
if (requestSection) {
requestSection.classList.toggle('hidden');
// Change button text
if (requestSection.classList.contains('hidden')) {
this.textContent = 'Show Request';
} else {
this.textContent = 'Hide Request';
}
}
});
});
</script>
</body>
</html>
"""
end
defp render_single_llm_call({llm_call, index}) do
request_content = format_request(llm_call.request)
response_content = format_response(llm_call.response)
timestamp_str = llm_call.timestamp |> DateTime.truncate(:second) |> DateTime.to_string()
request_section_id = "request-#{index}"
"""
<div class="llm-call">
<div class="llm-call-header">
<span>LLM Call ##{index} (#{timestamp_str})</span>
<button class="toggle-button" data-call-index="#{index}">Show Request</button>
</div>
<div class="llm-call-body">
<div id="#{request_section_id}" class="llm-request-section hidden">
<h3>Request</h3>
#{request_content}
</div>
<div>
<h3>Response</h3>
#{response_content}
</div>
</div>
</div>
"""
end
defp format_request(request) do
# Assuming request has a :messages key similar to OpenAI
messages = request[:messages] || request["messages"] || []
Enum.map_join(messages, &format_message/1)
end
defp format_response(response) do
# Assuming response format similar to OpenAI ChatCompletion
choices = response[:choices] || response["choices"] || []
Enum.map_join(choices, fn choice ->
message = choice[:message] || choice["message"]
finish_reason = choice[:finish_reason] || choice["finish_reason"]
message_html = format_message(message)
"""
#{message_html}
<p><em>Finish Reason: #{finish_reason || "N/A"}</em></p>
"""
end)
end
defp format_message(message) when is_map(message) do
role = Map.get(message, "role")
content = Map.get(message, "content")
reasoning = Map.get(message, "reasoning")
tool_calls = Map.get(message, "tool_calls")
content_html = ""
# Add reasoning with different styling if present
content_html =
if is_binary(reasoning) and String.trim(reasoning) != "" do
content_html <> ~s(<div class="reasoning-content">#{Plug.HTML.html_escape(reasoning)}</div>)
else
content_html
end
# Add content with regular styling if present
content_html =
if is_binary(content) and String.trim(content) != "" do
content_html <> ~s(<div class="message-content">#{Plug.HTML.html_escape(content)}</div>)
else
content_html <> ~s(<div class="message-content"><em>No text content</em></div>)
end
content_html =
if is_list(tool_calls) and not Enum.empty?(tool_calls) do
content_html <> format_tool_calls(tool_calls)
else
content_html
end
"""
<div class="message">
<div class="message-header">Role: #{role}</div>
#{content_html}
</div>
"""
end
defp format_message(_), do: "<p><em>Invalid message format</em></p>"
defp format_tool_calls(tool_calls) when is_list(tool_calls) do
Enum.map_join(tool_calls, fn tool_call ->
# Tool calls arrive from the API as maps with string keys
id = Map.get(tool_call, "id")
type = Map.get(tool_call, "type")
function_map = Map.get(tool_call, "function")
name = Map.get(function_map, "name")
arguments = Map.get(function_map, "arguments")
# Attempt to pretty-print arguments if they look like JSON
formatted_args =
case Jason.decode(arguments) do
{:ok, decoded_args} ->
decoded_args
|> Jason.encode!(pretty: true)
|> Plug.HTML.html_escape()
_ ->
Plug.HTML.html_escape(arguments)
end
"""
<div class="tool-call">
<strong>Tool Call ID:</strong> #{id}<br>
<strong>Type:</strong> #{type}<br>
<strong>Function Name:</strong> #{name}<br>
<strong>Arguments:</strong>
<pre><code>#{formatted_args}</code></pre>
</div>
"""
end)
end
defp format_tool_calls(_), do: "<em>Invalid tool calls format</em>"
end
integration_test_config = Application.get_env(:postline, Postline.IntegrationCase)
opts =
if integration_test_config[:enabled],
do: [formatters: [ExUnit.CLIFormatter, Postline.TestReporter]],
else: []
ExUnit.start(opts)
Ecto.Adapters.SQL.Sandbox.mode(Postline.Repo, :manual)
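# One possible wiring for the `:enabled` flag and the `mix test.integration` task used
# above (a sketch; the env var handling and alias are assumptions based on the
# IntegrationCase moduledoc, not part of this gist):
#
#   # config/test.exs
#   config :postline, Postline.IntegrationCase,
#     enabled: System.get_env("INTEGRATION_TEST") == "true"
#
#   # mix.exs
#   defp aliases do
#     ["test.integration": &test_integration/1]
#   end
#
#   defp test_integration(args) do
#     System.put_env("INTEGRATION_TEST", "true")
#     Mix.Task.run("test", ["--only", "integration" | args])
#   end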
defmodule Postline.TestReporter do
@moduledoc false
use GenServer
require Logger
# Client API
def start_link(opts \\ []) do
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end
@doc """
Reports an LLM generation call made during a test.
Determines the current test based on the provided module.
Requires the module, the request, and the response.
"""
def report_llm_call(module, request, response) do
GenServer.cast(__MODULE__, {:llm_generation, module, request, response})
end
@doc """
Marks the current test as having used a rescue block. This will be used
in the test reporter to show the test as "partial" rather than "passed".
"""
def mark_test_rescue_used(module) do
GenServer.cast(__MODULE__, {:mark_rescue_used, module})
end
# Server Callbacks
@impl true
def init(_opts) do
state = %{
# Structure: {module, name} => %{started: bool, finished: bool, llm_calls: list}
tests: %{}
}
# ExUnit starts formatters without a registered name, so register one here
# so that report_llm_call/3 and mark_test_rescue_used/1 can cast to this process
Process.register(self(), __MODULE__)
{:ok, state}
end
@impl true
def handle_cast({:suite_finished, _times_used}, state) do
# Generate HTML report and save it to a file
save_html_report(state)
{:noreply, state}
end
@impl true
def handle_cast({event, %{state: {:excluded, _reason}}}, state) when event in [:test_started, :test_finished] do
{:noreply, state}
end
@impl true
def handle_cast({:test_started, test}, state) do
test_key = {test.module, test.name}
# Initialize test state with current timestamp
start_time = System.monotonic_time()
new_state =
put_in(state, [:tests, test_key], %{
started: true,
finished: false,
error: nil,
llm_calls: [],
used_rescue: false,
start_time: start_time
})
{:noreply, new_state}
end
@impl true
def handle_cast({:test_finished, test}, state) do
test_key = {test.module, test.name}
end_time = System.monotonic_time()
# Calculate duration in milliseconds
start_time = get_in(state, [:tests, test_key, :start_time]) || end_time
duration_ms = System.convert_time_unit(end_time - start_time, :native, :millisecond)
# Print result to terminal
test_name = "#{inspect(test.module)} #{test.name}"
status =
case test.state do
{:failed, _} -> IO.ANSI.red() <> "FAILED" <> IO.ANSI.reset()
_ -> IO.ANSI.green() <> "PASSED" <> IO.ANSI.reset()
end
# Only print tests that carry a non-nil state (e.g. failures), so we don't duplicate ExUnit's own output for passing tests
if test.state != nil do
IO.puts("#{test_name} (#{duration_ms}ms): #{status}")
end
# Mark test as finished
error =
case test.state do
{:failed, [{:error, error, stacktrace}]} ->
file_link =
case stacktrace do
# Link to the first stack frame that carries file/line info
[{_mod, _func, _arity, [file: file, line: line]} | _] ->
"<a href=\"cursor://file/#{Path.absname(file)}:#{line}\">#{file}:#{line}</a>"
_ ->
""
end
message =
case error do
%ExUnit.AssertionError{left: left, right: right} when left != :ex_unit_no_meaningful_value ->
"Expected to be equal:<br>Left: #{inspect(left)}<br>Right: #{inspect(right)}"
%{message: message} when is_binary(message) ->
message
error ->
inspect(error)
end
file_link <> "<br>" <> message
{:failed, _} ->
"Unknown error"
_ ->
nil
end
new_state =
update_in(
state,
[:tests, test_key],
&(&1
|> Map.put(:finished, true)
|> Map.put(:error, error)
|> Map.put(:duration_ms, duration_ms))
)
{:noreply, new_state}
end
@impl true
def handle_cast({:mark_rescue_used, module}, state) do
# Find the currently running test for the given module
test_key =
Enum.find_value(state.tests, fn
{{^module, test_name}, %{started: true, finished: false}} -> {module, test_name}
_ -> nil
end)
case test_key do
nil ->
# No running test found for this module
Logger.error(
"Attempted to mark rescue used for module #{inspect(module)}, but no test is currently running for it."
)
{:noreply, state}
test_key ->
# Found the running test, mark it as having used a rescue block
new_state = put_in(state, [:tests, test_key, :used_rescue], true)
{:noreply, new_state}
end
end
@impl true
def handle_cast({:llm_generation, module, request, response}, state) do
# Find the currently running test for the given module
test_key =
Enum.find_value(state.tests, fn
{{^module, test_name}, %{started: true, finished: false}} -> {module, test_name}
_ -> nil
end)
case test_key do
nil ->
# No running test found for this module
Logger.error(
"Attempted to report LLM call for module #{inspect(module)}, but no test is currently running for it."
)
{:noreply, state}
test_key ->
# Found the running test, append the LLM call
llm_call_data = %{request: request, response: response, timestamp: DateTime.utc_now()}
new_state = update_in(state, [:tests, test_key, :llm_calls], fn calls -> [llm_call_data | calls] end)
{:noreply, new_state}
end
end
# catch all other messages
def handle_cast(_, state), do: {:noreply, state}
# Private functions
defp save_html_report(state) do
# Create reports directories if they don't exist
base_reports_dir = "reports"
llm_calls_dir = Path.join(base_reports_dir, "llm_calls")
File.mkdir_p!(base_reports_dir)
File.mkdir_p!(llm_calls_dir)
# Generate individual LLM call reports and collect links
tests_with_links =
Map.new(state.tests, fn {{module, name} = test_key, test_data} ->
if Enum.empty?(test_data.llm_calls) do
{test_key, test_data}
else
# Generate a unique, filesystem-safe filename
sanitized_module = module |> Module.split() |> Enum.join("_")
sanitized_name = name |> Atom.to_string() |> String.replace(~r/[^\w\-]+/, "_")
report_filename = "test_#{sanitized_module}_#{sanitized_name}.html"
report_filepath = Path.join(llm_calls_dir, report_filename)
# Link relative to main report
relative_link = Path.join("llm_calls", report_filename)
# Generate and save individual report
llm_html = Postline.TestFormatter.format_llm_calls_html(module, name, test_data.llm_calls)
File.write!(report_filepath, llm_html)
# Add link to test data for the main formatter
updated_test_data = Map.put(test_data, :llm_report_link, relative_link)
{test_key, updated_test_data}
end
end)
# Prepare state with updated test data (including links)
updated_state = %{state | tests: tests_with_links}
# Generate main report filename
main_report_filename = Path.join(base_reports_dir, "test_report.html")
# Generate main HTML content using updated state
main_html_content = Postline.TestFormatter.format_html(updated_state)
# Write main report to file
File.write!(main_report_filename, main_html_content)
IO.puts("Test report saved to file://#{Path.absname(main_report_filename)}")
# Note: Individual LLM reports are in #{llm_calls_dir}
end
end