zakelfassi · April 2, 2026 19:55
diff --git a/holo3-tac-eval.py b/holo3-tac-eval.py
 #!/usr/bin/env python3
 """
 Holo3 TAC UI Eval — test computer-use VLM on TAC surfaces.

 Uses dev-browser (https://github.com/SawyerHood/dev-browser) to connect
 to your running Chrome with the TAC extension installed, take screenshots
 of real extension + SPA surfaces, and send them to Holo3 for evaluation.

 Setup:
  npm install -g dev-browser && dev-browser install
  pip install openai
  # Launch Chrome with remote debugging:
  #   Mac: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
  #   Or enable at chrome://inspect/#remote-debugging
  export HAI_API_KEY="your-key-from-portal.hcompany.ai"

 Usage:
  python3 holo3-tac-eval.py [--url https://staging.talkandcomment.com] [--model holo3-35b-a3b]
  python3 holo3-tac-eval.py --tasks locate-record-btn describe-layout
 """

 import argparse
 import base64
 import json
 import os
 import subprocess
 import sys
 import time
 from pathlib import Path

 try:
    from openai import OpenAI
 except ImportError:
    sys.exit("pip install openai")


 # --- Config ---

 # Evaluation tasks. Each entry has:
 #   id          - identifier; also what --tasks is matched against
 #   setup       - JavaScript piped to dev-browser; it saves a screenshot and
 #                 prints "screenshot:<filename>" so the file can be located.
 #                 Written in str.format style: {base_url} is the placeholder
 #                 and literal JavaScript braces are doubled ({{ }}).
 #   instruction - prompt text sent to the model together with the screenshot
 #   schema      - JSON Schema passed to the API as the structured-output spec
 EVAL_TASKS = [
    {
        "id": "locate-record-btn",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(2000);
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "app-main.png");
            console.log("screenshot:app-main.png");
        """,
        "instruction": "Find the button or element that starts a voice recording. Return its label text and approximate x,y coordinates.",
        "schema": {
            "type": "object",
            "properties": {
                "element_label": {"type": "string", "description": "The visible text or aria-label of the record button"},
                "x": {"type": "integer", "description": "Approximate x coordinate of the element center"},
                "y": {"type": "integer", "description": "Approximate y coordinate of the element center"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_label", "x", "y", "confidence"],
        },
    },
    {
        "id": "identify-plan-badge",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(2000);
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "app-plan.png");
            console.log("screenshot:app-plan.png");
        """,
        "instruction": "Look at the user interface and identify what subscription plan the user is on. Find the plan badge or label and return the plan name and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "plan_name": {"type": "string", "description": "The subscription plan shown (e.g. Free, Trial, Teacher Pro)"},
                "badge_text": {"type": "string", "description": "Exact text of the badge element"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["plan_name", "confidence"],
        },
    },
    {
        "id": "find-upgrade-cta",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/pricing", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(2000);
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "pricing.png");
            console.log("screenshot:pricing.png");
        """,
        "instruction": "Find the primary call-to-action button for upgrading to a paid plan. Return the button text, its coordinates, and what plan it promotes.",
        "schema": {
            "type": "object",
            "properties": {
                "cta_text": {"type": "string", "description": "The text on the upgrade button"},
                "promoted_plan": {"type": "string", "description": "Which plan the CTA promotes"},
                "price_shown": {"type": "string", "description": "Any price visible near the CTA"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["cta_text", "confidence"],
        },
    },
    {
        "id": "describe-layout",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(2000);
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "app-layout.png");
            console.log("screenshot:app-layout.png");
        """,
        "instruction": "Describe the overall layout of this web application. What are the main sections visible? Is there a sidebar, header, main content area? List the key UI components you can identify.",
        "schema": {
            "type": "object",
            "properties": {
                "layout_type": {"type": "string", "description": "e.g. sidebar+main, single-column, dashboard-grid"},
                "sections": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of main UI sections visible",
                },
                "key_components": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Notable UI components (buttons, lists, cards, etc.)",
                },
                "overall_purpose": {"type": "string", "description": "What does this app appear to do?"},
            },
            "required": ["sections", "key_components", "overall_purpose"],
        },
    },
    {
        "id": "extension-floater",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(3000);
            // The TAC floater is injected by the Chrome extension as a shadow DOM element.
            // Screenshot the full page — if the extension is loaded, the floater should be visible.
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "app-with-ext.png");
            console.log("screenshot:app-with-ext.png");
        """,
        "instruction": "Look for a floating widget or overlay element that appears to be injected by a browser extension (not part of the main page). It may be a small circular button or a recording widget, likely positioned at the edge or corner of the viewport. Describe what you see and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "floater_found": {"type": "boolean", "description": "Whether a floating extension widget was detected"},
                "description": {"type": "string", "description": "What the floater looks like"},
                "position": {"type": "string", "description": "Where on screen (e.g. bottom-right, top-left)"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["floater_found", "confidence"],
        },
    },
    {
        "id": "nav-to-settings",
        "setup": """
            const page = await browser.getPage("tac-eval");
            await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
            await page.waitForTimeout(2000);
            const buf = await page.screenshot({{ type: "png" }});
            await saveScreenshot(buf, "app-nav.png");
            console.log("screenshot:app-nav.png");
        """,
        "instruction": "I want to navigate to the account or settings page. What element should I click? Return the exact text/icon and coordinates of the navigation element.",
        "schema": {
            "type": "object",
            "properties": {
                "element_description": {"type": "string"},
                "element_text": {"type": "string"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_description", "x", "y", "confidence"],
        },
    },
 ]

 # Directory that take_screenshot() looks in for the file named by a
 # "screenshot:<filename>" line. Presumably where dev-browser's saveScreenshot
 # helper writes its output — TODO confirm against dev-browser's docs.
 DEV_BROWSER_TMP = Path.home() / ".dev-browser" / "tmp"


 def run_dev_browser(script: str) -> str:
    """Pipe *script* to ``dev-browser --connect`` and return its stdout.

    The script is supplied on stdin and the call is capped at 30 seconds.

    Returns:
        The captured standard output with surrounding whitespace removed.

    Raises:
        RuntimeError: If dev-browser exits with a non-zero status; the message
            carries the stripped stderr.
    """
    command = ["dev-browser", "--connect"]
    proc = subprocess.run(command, input=script, capture_output=True, text=True, timeout=30)
    if proc.returncode == 0:
        return proc.stdout.strip()
    raise RuntimeError(f"dev-browser failed: {proc.stderr.strip()}")


 def take_screenshot(task: dict, base_url: str, out_dir: Path) -> Path:
    """Run the task's dev-browser setup script and collect its screenshot.

    The ``setup`` template is rendered with ``str.format``: ``{base_url}`` is
    substituted and the doubled braces (``{{ ... }}``) collapse to the single
    braces JavaScript needs. Substituting with ``str.replace`` would leave
    ``{{``/``}}`` in the script, which is not valid JavaScript.

    Args:
        task: Task entry providing the ``setup`` script template.
        base_url: Value substituted for ``{base_url}`` in the template.
        out_dir: Directory the screenshot is copied into.

    Returns:
        Path of the copied screenshot inside ``out_dir``.

    Raises:
        RuntimeError: If dev-browser fails, or if no ``screenshot:<name>`` line
            in its output names an existing file under ``DEV_BROWSER_TMP``.
        KeyError: If the template contains a placeholder other than
            ``{base_url}``.
    """
    script = task["setup"].format(base_url=base_url)
    output = run_dev_browser(script)

    # The setup script announces the saved file as "screenshot:<filename>".
    for line in output.splitlines():
        if not line.startswith("screenshot:"):
            continue
        fname = line.split(":", 1)[1].strip()
        src = DEV_BROWSER_TMP / fname
        # is_file() rather than exists(): an empty name would resolve to the
        # tmp directory itself, which exists but cannot be read as bytes.
        if src.is_file():
            dst = out_dir / fname
            dst.write_bytes(src.read_bytes())
            return dst

    raise RuntimeError(f"No screenshot produced. dev-browser output:\n{output}")


 def encode_image(path: Path) -> str:
    """Return the contents of the file at *path* as a base64 text string."""
    raw_bytes = path.read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


 def run_task(client, model: str, task: dict, screenshot_path: Path) -> dict:
    """Send a screenshot + instruction to Holo3 and return a result record.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Model ID passed to the API.
        task: Task entry; ``id``, ``instruction`` and ``schema`` are read.
        screenshot_path: PNG file sent inline as a base64 data URL.

    Returns:
        On success, a dict with ``task_id``, ``status`` ("ok"), ``result``,
        ``raw``, ``elapsed_s`` and ``usage``. ``result`` is always a dict: if
        the reply is not valid JSON, or is JSON but not an object, it is
        ``{"raw": content, "parse_error": True}``. If the request raises or
        the reply has no content, the dict has ``status`` "error" plus
        ``error`` and ``elapsed_s``.
    """
    b64 = encode_image(screenshot_path)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": task["instruction"]},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64}"},
                },
            ],
        }
    ]

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            extra_body={"structured_outputs": {"json": task["schema"]}},
            temperature=0.0,
        )
        elapsed = time.perf_counter() - t0
        content = response.choices[0].message.content
        if content is None:
            # Reported through the error record below with a readable message
            # instead of json.loads' TypeError text.
            raise ValueError("model response contained no message content")
        usage = {
            "prompt_tokens": response.usage.prompt_tokens if response.usage else None,
            "completion_tokens": response.usage.completion_tokens if response.usage else None,
        }

        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None
        # Callers iterate result.items(); anything that is not a JSON object
        # (invalid JSON, or valid JSON such as a list, number or null) is
        # wrapped so the record shape stays uniform.
        if not isinstance(parsed, dict):
            parsed = {"raw": content, "parse_error": True}

        return {
            "task_id": task["id"],
            "status": "ok",
            "result": parsed,
            "raw": content,
            "elapsed_s": round(elapsed, 2),
            "usage": usage,
        }
    except Exception as e:
        # Per-task boundary: one failed request must not abort the whole run.
        return {
            "task_id": task["id"],
            "status": "error",
            "error": str(e),
            "elapsed_s": round(time.perf_counter() - t0, 2),
        }


 def check_dev_browser():
    """Verify dev-browser is installed and can connect.

    Runs a one-line probe script through ``dev-browser --connect`` that prints
    the number of open pages. On success a confirmation line is printed;
    otherwise the process exits.

    Raises:
        SystemExit: If dev-browser is not installed, the probe does not finish
            within 10 seconds, or it exits with a non-zero status.
    """
    probe = 'const tabs = await browser.listPages(); console.log(JSON.stringify(tabs.length));'
    try:
        result = subprocess.run(
            ["dev-browser", "--connect"],
            input=probe,
            capture_output=True, text=True, timeout=10,
        )
    except FileNotFoundError:
        sys.exit("dev-browser not found. Install: npm install -g dev-browser && dev-browser install")
    except subprocess.TimeoutExpired:
        # A hung connection used to surface as a raw traceback.
        sys.exit("dev-browser --connect timed out after 10s. Is Chrome running with remote debugging enabled?")

    if result.returncode != 0:
        print(f"⚠️  dev-browser --connect failed: {result.stderr.strip()}")
        print("   Make sure Chrome is running with --remote-debugging-port=9222")
        print("   Or enable remote debugging at chrome://inspect/#remote-debugging")
        sys.exit(1)
    tab_count = result.stdout.strip()
    print(f"   ✅ Connected to Chrome ({tab_count} tabs)")


 def main():
    """Command-line entry point: capture screenshots, query Holo3, summarize.

    Phases:
        1. Run every selected task's setup script through dev-browser and copy
           the resulting screenshot into ``<out>/screenshots``.
        2. Unless ``--screenshot-only`` is given, send each screenshot with its
           instruction to the model, pausing 7 s between tasks for the
           free-tier rate limit.
        3. Print a summary and write ``<out>/results.json``.

    Raises:
        SystemExit: If inference is requested without an API key, if
            ``--tasks`` matches no known task, or if the dev-browser check
            fails.
    """
    parser = argparse.ArgumentParser(description="Holo3 TAC UI Eval (dev-browser)")
    parser.add_argument("--url", default="https://staging.talkandcomment.com", help="Base URL to test")
    parser.add_argument("--model", default="holo3-35b-a3b", help="Model ID")
    parser.add_argument("--api-key", default=os.environ.get("HAI_API_KEY"), help="H Company API key")
    parser.add_argument("--out", default="./holo3-eval-results", help="Output directory")
    parser.add_argument("--tasks", nargs="*", help="Run specific task IDs only")
    parser.add_argument("--screenshot-only", action="store_true", help="Only take screenshots, skip Holo3 inference")
    args = parser.parse_args()

    if not args.screenshot_only and not args.api_key:
        sys.exit("Set HAI_API_KEY env var or pass --api-key. Get one free at https://portal.hcompany.ai/")

    tasks = EVAL_TASKS
    if args.tasks:
        requested = set(args.tasks)
        # A mistyped ID used to be dropped silently; say so, and stop if
        # nothing is left to run.
        unknown = sorted(requested - {t["id"] for t in EVAL_TASKS})
        if unknown:
            print(f"⚠️  Ignoring unknown task IDs: {', '.join(unknown)}")
        tasks = [t for t in EVAL_TASKS if t["id"] in requested]
        if not tasks:
            sys.exit("No matching tasks. Known IDs: " + ", ".join(t["id"] for t in EVAL_TASKS))

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    screenshots_dir = out_dir / "screenshots"
    screenshots_dir.mkdir(exist_ok=True)

    print("\n🔍 Holo3 TAC UI Eval (dev-browser)")
    print(f"   Target:  {args.url}")
    print(f"   Model:   {args.model}")
    print(f"   Tasks:   {len(tasks)}")
    print(f"   Output:  {out_dir}")
    print("\n🔌 Checking dev-browser connection...")
    check_dev_browser()

    # Phase 1: Screenshots via dev-browser --connect
    print("\n📷 Phase 1: Capturing screenshots from live Chrome...")
    screenshot_map = {}
    for task in tasks:
        print(f"  📸 {task['id']}...")
        try:
            screenshot_map[task["id"]] = take_screenshot(task, args.url, screenshots_dir)
            print(f"     ✅ {screenshot_map[task['id']]}")
        except Exception as e:
            # Best effort: a failed capture marks the task as skipped later.
            print(f"     ❌ {e}")
            screenshot_map[task["id"]] = None

    if args.screenshot_only:
        print(f"\n📸 Screenshots saved to {screenshots_dir}/")
        return

    # Phase 2: Run Holo3 inference
    client = OpenAI(base_url="https://api.hcompany.ai/v1/", api_key=args.api_key)

    print("\n🤖 Phase 2: Running Holo3 eval tasks...")
    results = []
    for i, task in enumerate(tasks, 1):
        ss = screenshot_map.get(task["id"])
        if not ss:
            results.append({"task_id": task["id"], "status": "skipped", "error": "no screenshot"})
            continue

        print(f"\n  [{i}/{len(tasks)}] {task['id']}")
        print(f"       Instruction: {task['instruction'][:80]}...")
        result = run_task(client, args.model, task, ss)
        results.append(result)

        if result["status"] == "ok":
            print(f"       ✅ {result['elapsed_s']}s | tokens: {result['usage']}")
            fields = result["result"]
            # A reply that ignores the schema may be JSON that is not an
            # object; show it as a single value instead of crashing.
            if not isinstance(fields, dict):
                fields = {"value": fields}
            for k, v in fields.items():
                val = str(v)
                if len(val) > 100:
                    val = val[:100] + "..."
                print(f"          {k}: {val}")
        else:
            print(f"       ❌ {result['error']}")

        # Respect free tier rate limit (10 RPM)
        if i < len(tasks):
            time.sleep(7)

    # Phase 3: Summary
    print("\n" + "=" * 60)
    print("📊 EVAL SUMMARY")
    print("=" * 60)

    ok = [r for r in results if r["status"] == "ok"]
    err = [r for r in results if r["status"] in ("error", "skipped")]
    total_tokens_in = sum(r.get("usage", {}).get("prompt_tokens") or 0 for r in ok)
    total_tokens_out = sum(r.get("usage", {}).get("completion_tokens") or 0 for r in ok)
    # 0.25 / 1.80 are presumably USD prices per million input / output tokens
    # — TODO confirm against current pricing.
    est_cost = (total_tokens_in * 0.25 + total_tokens_out * 1.80) / 1_000_000

    print(f"  Passed:  {len(ok)}/{len(results)}")
    print(f"  Failed:  {len(err)}/{len(results)}")
    print(f"  Tokens:  {total_tokens_in} in / {total_tokens_out} out")
    print(f"  Est cost: ${est_cost:.4f}")
    if ok:
        print(f"  Avg latency: {sum(r['elapsed_s'] for r in ok) / len(ok):.1f}s")

    high_conf = [
        r for r in ok
        if isinstance(r["result"], dict) and r["result"].get("confidence") == "high"
    ]
    print(f"  High confidence: {len(high_conf)}/{len(ok)}")

    # Save results
    results_path = out_dir / "results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "meta": {
                    "url": args.url,
                    "model": args.model,
                    "browser": "dev-browser --connect (live Chrome)",
                    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
                    "task_count": len(tasks),
                },
                "summary": {
                    "passed": len(ok),
                    "failed": len(err),
                    "tokens_in": total_tokens_in,
                    "tokens_out": total_tokens_out,
                    "est_cost_usd": round(est_cost, 6),
                },
                "tasks": results,
            },
            f,
            indent=2,
        )
    print(f"\n  Results saved to {results_path}")
    print(f"  Screenshots in {screenshots_dir}/\n")


 # Run the eval only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Holo3 TAC UI Eval — test computer-use VLM on TAC surfaces.

	Uses dev-browser (https://github.com/SawyerHood/dev-browser) to connect
	to your running Chrome with the TAC extension installed, take screenshots
	of real extension + SPA surfaces, and send them to Holo3 for evaluation.

	Setup:
	npm install -g dev-browser && dev-browser install
	pip install openai
	# Launch Chrome with remote debugging:
	# Mac: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
	# Or enable at chrome://inspect/#remote-debugging
	export HAI_API_KEY="your-key-from-portal.hcompany.ai"

	Usage:
	python3 holo3-tac-eval.py [--url https://staging.talkandcomment.com] [--model holo3-35b-a3b]
	python3 holo3-tac-eval.py --tasks locate-record-btn describe-layout
	"""

	import argparse
	import base64
	import json
	import os
	import subprocess
	import sys
	import time
	from pathlib import Path

	try:
	from openai import OpenAI
	except ImportError:
	sys.exit("pip install openai")


	# --- Config ---

	# Evaluation tasks. Each entry has:
	#   id          - identifier; also what --tasks is matched against
	#   setup       - JavaScript piped to dev-browser; it saves a screenshot and
	#                 prints "screenshot:<filename>" so the file can be located.
	#                 Written in str.format style: {base_url} is the placeholder
	#                 and literal JavaScript braces are doubled ({{ }}).
	#   instruction - prompt text sent to the model together with the screenshot
	#   schema      - JSON Schema passed to the API as the structured-output spec
	EVAL_TASKS = [
	{
	"id": "locate-record-btn",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(2000);
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "app-main.png");
	console.log("screenshot:app-main.png");
	""",
	"instruction": "Find the button or element that starts a voice recording. Return its label text and approximate x,y coordinates.",
	"schema": {
	"type": "object",
	"properties": {
	"element_label": {"type": "string", "description": "The visible text or aria-label of the record button"},
	"x": {"type": "integer", "description": "Approximate x coordinate of the element center"},
	"y": {"type": "integer", "description": "Approximate y coordinate of the element center"},
	"confidence": {"type": "string", "enum": ["high", "medium", "low"]},
	},
	"required": ["element_label", "x", "y", "confidence"],
	},
	},
	{
	"id": "identify-plan-badge",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(2000);
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "app-plan.png");
	console.log("screenshot:app-plan.png");
	""",
	"instruction": "Look at the user interface and identify what subscription plan the user is on. Find the plan badge or label and return the plan name and its location.",
	"schema": {
	"type": "object",
	"properties": {
	"plan_name": {"type": "string", "description": "The subscription plan shown (e.g. Free, Trial, Teacher Pro)"},
	"badge_text": {"type": "string", "description": "Exact text of the badge element"},
	"x": {"type": "integer"},
	"y": {"type": "integer"},
	"confidence": {"type": "string", "enum": ["high", "medium", "low"]},
	},
	"required": ["plan_name", "confidence"],
	},
	},
	{
	"id": "find-upgrade-cta",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/pricing", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(2000);
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "pricing.png");
	console.log("screenshot:pricing.png");
	""",
	"instruction": "Find the primary call-to-action button for upgrading to a paid plan. Return the button text, its coordinates, and what plan it promotes.",
	"schema": {
	"type": "object",
	"properties": {
	"cta_text": {"type": "string", "description": "The text on the upgrade button"},
	"promoted_plan": {"type": "string", "description": "Which plan the CTA promotes"},
	"price_shown": {"type": "string", "description": "Any price visible near the CTA"},
	"x": {"type": "integer"},
	"y": {"type": "integer"},
	"confidence": {"type": "string", "enum": ["high", "medium", "low"]},
	},
	"required": ["cta_text", "confidence"],
	},
	},
	{
	"id": "describe-layout",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(2000);
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "app-layout.png");
	console.log("screenshot:app-layout.png");
	""",
	"instruction": "Describe the overall layout of this web application. What are the main sections visible? Is there a sidebar, header, main content area? List the key UI components you can identify.",
	"schema": {
	"type": "object",
	"properties": {
	"layout_type": {"type": "string", "description": "e.g. sidebar+main, single-column, dashboard-grid"},
	"sections": {
	"type": "array",
	"items": {"type": "string"},
	"description": "List of main UI sections visible",
	},
	"key_components": {
	"type": "array",
	"items": {"type": "string"},
	"description": "Notable UI components (buttons, lists, cards, etc.)",
	},
	"overall_purpose": {"type": "string", "description": "What does this app appear to do?"},
	},
	"required": ["sections", "key_components", "overall_purpose"],
	},
	},
	{
	"id": "extension-floater",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(3000);
	// The TAC floater is injected by the Chrome extension as a shadow DOM element.
	// Screenshot the full page — if the extension is loaded, the floater should be visible.
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "app-with-ext.png");
	console.log("screenshot:app-with-ext.png");
	""",
	"instruction": "Look for a floating widget or overlay element that appears to be injected by a browser extension (not part of the main page). It may be a small circular button or a recording widget, likely positioned at the edge or corner of the viewport. Describe what you see and its location.",
	"schema": {
	"type": "object",
	"properties": {
	"floater_found": {"type": "boolean", "description": "Whether a floating extension widget was detected"},
	"description": {"type": "string", "description": "What the floater looks like"},
	"position": {"type": "string", "description": "Where on screen (e.g. bottom-right, top-left)"},
	"x": {"type": "integer"},
	"y": {"type": "integer"},
	"confidence": {"type": "string", "enum": ["high", "medium", "low"]},
	},
	"required": ["floater_found", "confidence"],
	},
	},
	{
	"id": "nav-to-settings",
	"setup": """
	const page = await browser.getPage("tac-eval");
	await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
	await page.waitForTimeout(2000);
	const buf = await page.screenshot({{ type: "png" }});
	await saveScreenshot(buf, "app-nav.png");
	console.log("screenshot:app-nav.png");
	""",
	"instruction": "I want to navigate to the account or settings page. What element should I click? Return the exact text/icon and coordinates of the navigation element.",
	"schema": {
	"type": "object",
	"properties": {
	"element_description": {"type": "string"},
	"element_text": {"type": "string"},
	"x": {"type": "integer"},
	"y": {"type": "integer"},
	"confidence": {"type": "string", "enum": ["high", "medium", "low"]},
	},
	"required": ["element_description", "x", "y", "confidence"],
	},
	},
	]

	# Directory that take_screenshot() looks in for the file named by a
	# "screenshot:<filename>" line. Presumably where dev-browser's saveScreenshot
	# helper writes its output — TODO confirm against dev-browser's docs.
	DEV_BROWSER_TMP = Path.home() / ".dev-browser" / "tmp"


	def run_dev_browser(script: str) -> str:
	    """Run a script via ``dev-browser --connect`` and return its stdout.

	    The script is supplied on stdin and the call is capped at 30 seconds.
	    (Block indentation restored; the flattened form was a syntax error.)

	    Returns:
	        The captured standard output with surrounding whitespace removed.

	    Raises:
	        RuntimeError: If dev-browser exits with a non-zero status; the
	            message carries the stripped stderr.
	    """
	    result = subprocess.run(
	        ["dev-browser", "--connect"],
	        input=script,
	        capture_output=True,
	        text=True,
	        timeout=30,
	    )
	    if result.returncode != 0:
	        raise RuntimeError(f"dev-browser failed: {result.stderr.strip()}")
	    return result.stdout.strip()


	def take_screenshot(task: dict, base_url: str, out_dir: Path) -> Path:
	    """Run the task's dev-browser setup script and collect its screenshot.

	    The ``setup`` template is rendered with ``str.format``: ``{base_url}`` is
	    substituted and the doubled braces (``{{ ... }}``) collapse to the single
	    braces JavaScript needs. Substituting with ``str.replace`` would leave
	    ``{{``/``}}`` in the script, which is not valid JavaScript.

	    Args:
	        task: Task entry providing the ``setup`` script template.
	        base_url: Value substituted for ``{base_url}`` in the template.
	        out_dir: Directory the screenshot is copied into.

	    Returns:
	        Path of the copied screenshot inside ``out_dir``.

	    Raises:
	        RuntimeError: If dev-browser fails, or if no ``screenshot:<name>``
	            line in its output names an existing file under
	            ``DEV_BROWSER_TMP``.
	        KeyError: If the template contains a placeholder other than
	            ``{base_url}``.
	    """
	    script = task["setup"].format(base_url=base_url)
	    output = run_dev_browser(script)

	    # The setup script announces the saved file as "screenshot:<filename>".
	    for line in output.splitlines():
	        if not line.startswith("screenshot:"):
	            continue
	        fname = line.split(":", 1)[1].strip()
	        src = DEV_BROWSER_TMP / fname
	        # is_file() rather than exists(): an empty name would resolve to the
	        # tmp directory itself, which exists but cannot be read as bytes.
	        if src.is_file():
	            dst = out_dir / fname
	            dst.write_bytes(src.read_bytes())
	            return dst

	    raise RuntimeError(f"No screenshot produced. dev-browser output:\n{output}")


	def encode_image(path: Path) -> str:
	    """Return the contents of the file at *path* as a base64 text string.

	    (Body indentation restored; the flattened form was a syntax error.)
	    """
	    return base64.b64encode(path.read_bytes()).decode("utf-8")


	def run_task(client, model: str, task: dict, screenshot_path: Path) -> dict:
	    """Send a screenshot + instruction to Holo3 and return a result record.

	    Args:
	        client: OpenAI-compatible client exposing ``chat.completions.create``.
	        model: Model ID passed to the API.
	        task: Task entry; ``id``, ``instruction`` and ``schema`` are read.
	        screenshot_path: PNG file sent inline as a base64 data URL.

	    Returns:
	        On success, a dict with ``task_id``, ``status`` ("ok"), ``result``,
	        ``raw``, ``elapsed_s`` and ``usage``. ``result`` is always a dict: if
	        the reply is not valid JSON, or is JSON but not an object, it is
	        ``{"raw": content, "parse_error": True}``. If the request raises or
	        the reply has no content, the dict has ``status`` "error" plus
	        ``error`` and ``elapsed_s``.
	    """
	    b64 = encode_image(screenshot_path)

	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": task["instruction"]},
	                {
	                    "type": "image_url",
	                    "image_url": {"url": f"data:image/png;base64,{b64}"},
	                },
	            ],
	        }
	    ]

	    # perf_counter is monotonic, so the measured latency cannot be skewed by
	    # wall-clock adjustments the way time.time() can.
	    t0 = time.perf_counter()
	    try:
	        response = client.chat.completions.create(
	            model=model,
	            messages=messages,
	            extra_body={"structured_outputs": {"json": task["schema"]}},
	            temperature=0.0,
	        )
	        elapsed = time.perf_counter() - t0
	        content = response.choices[0].message.content
	        if content is None:
	            # Reported through the error record below with a readable
	            # message instead of json.loads' TypeError text.
	            raise ValueError("model response contained no message content")
	        usage = {
	            "prompt_tokens": response.usage.prompt_tokens if response.usage else None,
	            "completion_tokens": response.usage.completion_tokens if response.usage else None,
	        }

	        try:
	            parsed = json.loads(content)
	        except json.JSONDecodeError:
	            parsed = None
	        # Callers iterate result.items(); anything that is not a JSON object
	        # (invalid JSON, or valid JSON such as a list, number or null) is
	        # wrapped so the record shape stays uniform.
	        if not isinstance(parsed, dict):
	            parsed = {"raw": content, "parse_error": True}

	        return {
	            "task_id": task["id"],
	            "status": "ok",
	            "result": parsed,
	            "raw": content,
	            "elapsed_s": round(elapsed, 2),
	            "usage": usage,
	        }
	    except Exception as e:
	        # Per-task boundary: one failed request must not abort the whole run.
	        return {
	            "task_id": task["id"],
	            "status": "error",
	            "error": str(e),
	            "elapsed_s": round(time.perf_counter() - t0, 2),
	        }


	def check_dev_browser():
	    """Verify dev-browser is installed and can connect.

	    Runs a one-line probe script through ``dev-browser --connect`` that
	    prints the number of open pages. On success a confirmation line is
	    printed; otherwise the process exits.

	    Raises:
	        SystemExit: If dev-browser is not installed, the probe does not
	            finish within 10 seconds, or it exits with a non-zero status.
	    """
	    probe = 'const tabs = await browser.listPages(); console.log(JSON.stringify(tabs.length));'
	    try:
	        result = subprocess.run(
	            ["dev-browser", "--connect"],
	            input=probe,
	            capture_output=True, text=True, timeout=10,
	        )
	    except FileNotFoundError:
	        sys.exit("dev-browser not found. Install: npm install -g dev-browser && dev-browser install")
	    except subprocess.TimeoutExpired:
	        # A hung connection used to surface as a raw traceback.
	        sys.exit("dev-browser --connect timed out after 10s. Is Chrome running with remote debugging enabled?")

	    if result.returncode != 0:
	        print(f"⚠️ dev-browser --connect failed: {result.stderr.strip()}")
	        print(" Make sure Chrome is running with --remote-debugging-port=9222")
	        print(" Or enable remote debugging at chrome://inspect/#remote-debugging")
	        sys.exit(1)
	    tab_count = result.stdout.strip()
	    print(f" ✅ Connected to Chrome ({tab_count} tabs)")


	def main():
	    """Command-line entry point: capture screenshots, query Holo3, summarize.

	    Phases:
	        1. Run every selected task's setup script through dev-browser and
	           copy the resulting screenshot into ``<out>/screenshots``.
	        2. Unless ``--screenshot-only`` is given, send each screenshot with
	           its instruction to the model, pausing 7 s between tasks for the
	           free-tier rate limit.
	        3. Print a summary and write ``<out>/results.json``.

	    Raises:
	        SystemExit: If inference is requested without an API key, if
	            ``--tasks`` matches no known task, or if the dev-browser check
	            fails.
	    """
	    parser = argparse.ArgumentParser(description="Holo3 TAC UI Eval (dev-browser)")
	    parser.add_argument("--url", default="https://staging.talkandcomment.com", help="Base URL to test")
	    parser.add_argument("--model", default="holo3-35b-a3b", help="Model ID")
	    parser.add_argument("--api-key", default=os.environ.get("HAI_API_KEY"), help="H Company API key")
	    parser.add_argument("--out", default="./holo3-eval-results", help="Output directory")
	    parser.add_argument("--tasks", nargs="*", help="Run specific task IDs only")
	    parser.add_argument("--screenshot-only", action="store_true", help="Only take screenshots, skip Holo3 inference")
	    args = parser.parse_args()

	    if not args.screenshot_only and not args.api_key:
	        sys.exit("Set HAI_API_KEY env var or pass --api-key. Get one free at https://portal.hcompany.ai/")

	    tasks = EVAL_TASKS
	    if args.tasks:
	        requested = set(args.tasks)
	        # A mistyped ID used to be dropped silently; say so, and stop if
	        # nothing is left to run.
	        unknown = sorted(requested - {t["id"] for t in EVAL_TASKS})
	        if unknown:
	            print(f"⚠️ Ignoring unknown task IDs: {', '.join(unknown)}")
	        tasks = [t for t in EVAL_TASKS if t["id"] in requested]
	        if not tasks:
	            sys.exit("No matching tasks. Known IDs: " + ", ".join(t["id"] for t in EVAL_TASKS))

	    out_dir = Path(args.out)
	    out_dir.mkdir(parents=True, exist_ok=True)
	    screenshots_dir = out_dir / "screenshots"
	    screenshots_dir.mkdir(exist_ok=True)

	    print("\n🔍 Holo3 TAC UI Eval (dev-browser)")
	    print(f" Target: {args.url}")
	    print(f" Model: {args.model}")
	    print(f" Tasks: {len(tasks)}")
	    print(f" Output: {out_dir}")
	    print("\n🔌 Checking dev-browser connection...")
	    check_dev_browser()

	    # Phase 1: Screenshots via dev-browser --connect
	    print("\n📷 Phase 1: Capturing screenshots from live Chrome...")
	    screenshot_map = {}
	    for task in tasks:
	        print(f" 📸 {task['id']}...")
	        try:
	            screenshot_map[task["id"]] = take_screenshot(task, args.url, screenshots_dir)
	            print(f" ✅ {screenshot_map[task['id']]}")
	        except Exception as e:
	            # Best effort: a failed capture marks the task as skipped later.
	            print(f" ❌ {e}")
	            screenshot_map[task["id"]] = None

	    if args.screenshot_only:
	        print(f"\n📸 Screenshots saved to {screenshots_dir}/")
	        return

	    # Phase 2: Run Holo3 inference
	    client = OpenAI(base_url="https://api.hcompany.ai/v1/", api_key=args.api_key)

	    print("\n🤖 Phase 2: Running Holo3 eval tasks...")
	    results = []
	    for i, task in enumerate(tasks, 1):
	        ss = screenshot_map.get(task["id"])
	        if not ss:
	            results.append({"task_id": task["id"], "status": "skipped", "error": "no screenshot"})
	            continue

	        print(f"\n [{i}/{len(tasks)}] {task['id']}")
	        print(f" Instruction: {task['instruction'][:80]}...")
	        result = run_task(client, args.model, task, ss)
	        results.append(result)

	        if result["status"] == "ok":
	            # "\|" was an invalid escape sequence that printed a stray
	            # backslash; a plain "|" is what is meant.
	            print(f" ✅ {result['elapsed_s']}s | tokens: {result['usage']}")
	            fields = result["result"]
	            # A reply that ignores the schema may be JSON that is not an
	            # object; show it as a single value instead of crashing.
	            if not isinstance(fields, dict):
	                fields = {"value": fields}
	            for k, v in fields.items():
	                val = str(v)
	                if len(val) > 100:
	                    val = val[:100] + "..."
	                print(f" {k}: {val}")
	        else:
	            print(f" ❌ {result['error']}")

	        # Respect free tier rate limit (10 RPM)
	        if i < len(tasks):
	            time.sleep(7)

	    # Phase 3: Summary
	    print("\n" + "=" * 60)
	    print("📊 EVAL SUMMARY")
	    print("=" * 60)

	    ok = [r for r in results if r["status"] == "ok"]
	    err = [r for r in results if r["status"] in ("error", "skipped")]
	    total_tokens_in = sum(r.get("usage", {}).get("prompt_tokens") or 0 for r in ok)
	    total_tokens_out = sum(r.get("usage", {}).get("completion_tokens") or 0 for r in ok)
	    # 0.25 / 1.80 are presumably USD prices per million input / output
	    # tokens — TODO confirm against current pricing.
	    est_cost = (total_tokens_in * 0.25 + total_tokens_out * 1.80) / 1_000_000

	    print(f" Passed: {len(ok)}/{len(results)}")
	    print(f" Failed: {len(err)}/{len(results)}")
	    print(f" Tokens: {total_tokens_in} in / {total_tokens_out} out")
	    print(f" Est cost: ${est_cost:.4f}")
	    if ok:
	        print(f" Avg latency: {sum(r['elapsed_s'] for r in ok) / len(ok):.1f}s")

	    high_conf = [
	        r for r in ok
	        if isinstance(r["result"], dict) and r["result"].get("confidence") == "high"
	    ]
	    print(f" High confidence: {len(high_conf)}/{len(ok)}")

	    # Save results
	    results_path = out_dir / "results.json"
	    with open(results_path, "w", encoding="utf-8") as f:
	        json.dump(
	            {
	                "meta": {
	                    "url": args.url,
	                    "model": args.model,
	                    "browser": "dev-browser --connect (live Chrome)",
	                    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
	                    "task_count": len(tasks),
	                },
	                "summary": {
	                    "passed": len(ok),
	                    "failed": len(err),
	                    "tokens_in": total_tokens_in,
	                    "tokens_out": total_tokens_out,
	                    "est_cost_usd": round(est_cost, 6),
	                },
	                "tasks": results,
	            },
	            f,
	            indent=2,
	        )
	    print(f"\n Results saved to {results_path}")
	    print(f" Screenshots in {screenshots_dir}/\n")


	# Run the eval only when this file is executed as a script, not on import.
	# (Body indentation restored; the flattened form was a syntax error.)
	if __name__ == "__main__":
	    main()
No results found