Last active
April 2, 2026 19:55
-
-
Save zakelfassi/7e1f98c5f0ea150ac90ca463fbcc54a3 to your computer and use it in GitHub Desktop.
Holo3 TAC UI Eval — test computer-use VLM on Talk & Comment surfaces
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Holo3 TAC UI Eval — test computer-use VLM on TAC surfaces. | |
| Uses dev-browser (https://github.com/SawyerHood/dev-browser) to connect | |
| to your running Chrome with the TAC extension installed, take screenshots | |
| of real extension + SPA surfaces, and send them to Holo3 for evaluation. | |
| Setup: | |
| npm install -g dev-browser && dev-browser install | |
| pip install openai | |
| # Launch Chrome with remote debugging: | |
| # Mac: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 | |
| # Or enable at chrome://inspect/#remote-debugging | |
| export HAI_API_KEY="your-key-from-portal.hcompany.ai" | |
| Usage: | |
| python3 holo3-tac-eval.py [--url https://staging.talkandcomment.com] [--model holo3-35b-a3b] | |
| python3 holo3-tac-eval.py --tasks locate-record-btn describe-layout | |
| """ | |
| import argparse | |
| import base64 | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| from pathlib import Path | |
| try: | |
| from openai import OpenAI | |
| except ImportError: | |
| sys.exit("pip install openai") | |
# --- Config ---
# Each entry in EVAL_TASKS describes one evaluation task:
#   id          - identifier; matched against the --tasks CLI values in main().
#   setup       - JavaScript source piped to `dev-browser --connect`. It navigates
#                 to a page, saves a screenshot and prints a
#                 "screenshot:<filename>" line that take_screenshot() parses.
#   instruction - prompt text sent together with the screenshot in run_task().
#   schema      - JSON schema passed as the structured-output spec in run_task().
# The "{base_url}" placeholder in each setup script is substituted by
# take_screenshot().
# NOTE(review): the JS object literals are written with doubled braces
# ("{{ ... }}"), which is the str.format escape convention — confirm that the code
# rendering these templates collapses them to single braces before the script
# reaches dev-browser.
EVAL_TASKS = [
    # Task: locate the control that starts a voice recording on the /app page.
    {
        "id": "locate-record-btn",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-main.png");
console.log("screenshot:app-main.png");
""",
        "instruction": "Find the button or element that starts a voice recording. Return its label text and approximate x,y coordinates.",
        "schema": {
            "type": "object",
            "properties": {
                "element_label": {"type": "string", "description": "The visible text or aria-label of the record button"},
                "x": {"type": "integer", "description": "Approximate x coordinate of the element center"},
                "y": {"type": "integer", "description": "Approximate y coordinate of the element center"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_label", "x", "y", "confidence"],
        },
    },
    # Task: read the subscription plan badge on the /app page.
    {
        "id": "identify-plan-badge",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-plan.png");
console.log("screenshot:app-plan.png");
""",
        "instruction": "Look at the user interface and identify what subscription plan the user is on. Find the plan badge or label and return the plan name and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "plan_name": {"type": "string", "description": "The subscription plan shown (e.g. Free, Trial, Teacher Pro)"},
                "badge_text": {"type": "string", "description": "Exact text of the badge element"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["plan_name", "confidence"],
        },
    },
    # Task: find the primary upgrade call-to-action on the /pricing page.
    {
        "id": "find-upgrade-cta",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/pricing", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "pricing.png");
console.log("screenshot:pricing.png");
""",
        "instruction": "Find the primary call-to-action button for upgrading to a paid plan. Return the button text, its coordinates, and what plan it promotes.",
        "schema": {
            "type": "object",
            "properties": {
                "cta_text": {"type": "string", "description": "The text on the upgrade button"},
                "promoted_plan": {"type": "string", "description": "Which plan the CTA promotes"},
                "price_shown": {"type": "string", "description": "Any price visible near the CTA"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["cta_text", "confidence"],
        },
    },
    # Task: free-form description of the /app page layout (no coordinates requested).
    {
        "id": "describe-layout",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-layout.png");
console.log("screenshot:app-layout.png");
""",
        "instruction": "Describe the overall layout of this web application. What are the main sections visible? Is there a sidebar, header, main content area? List the key UI components you can identify.",
        "schema": {
            "type": "object",
            "properties": {
                "layout_type": {"type": "string", "description": "e.g. sidebar+main, single-column, dashboard-grid"},
                "sections": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of main UI sections visible",
                },
                "key_components": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Notable UI components (buttons, lists, cards, etc.)",
                },
                "overall_purpose": {"type": "string", "description": "What does this app appear to do?"},
            },
            "required": ["sections", "key_components", "overall_purpose"],
        },
    },
    # Task: detect the floating widget injected by the browser extension.
    # This setup waits 3000 ms (the other tasks wait 2000 ms) before the screenshot.
    {
        "id": "extension-floater",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(3000);
// The TAC floater is injected by the Chrome extension as a shadow DOM element.
// Screenshot the full page — if the extension is loaded, the floater should be visible.
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-with-ext.png");
console.log("screenshot:app-with-ext.png");
""",
        "instruction": "Look for a floating widget or overlay element that appears to be injected by a browser extension (not part of the main page). It may be a small circular button or a recording widget, likely positioned at the edge or corner of the viewport. Describe what you see and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "floater_found": {"type": "boolean", "description": "Whether a floating extension widget was detected"},
                "description": {"type": "string", "description": "What the floater looks like"},
                "position": {"type": "string", "description": "Where on screen (e.g. bottom-right, top-left)"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["floater_found", "confidence"],
        },
    },
    # Task: find the navigation element leading to account/settings on /app.
    {
        "id": "nav-to-settings",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-nav.png");
console.log("screenshot:app-nav.png");
""",
        "instruction": "I want to navigate to the account or settings page. What element should I click? Return the exact text/icon and coordinates of the navigation element.",
        "schema": {
            "type": "object",
            "properties": {
                "element_description": {"type": "string"},
                "element_text": {"type": "string"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_description", "x", "y", "confidence"],
        },
    },
]

# Directory in which take_screenshot() looks for the files named in
# "screenshot:" output lines — presumably where dev-browser's saveScreenshot()
# writes; verify against the dev-browser documentation.
DEV_BROWSER_TMP = Path.home() / ".dev-browser" / "tmp"
def run_dev_browser(script: str, timeout: float = 30) -> str:
    """Run a script via ``dev-browser --connect`` and return its stdout.

    Args:
        script: JavaScript source fed to dev-browser on stdin.
        timeout: Seconds to wait for dev-browser to finish. Defaults to 30,
            the value that was previously hard-coded.

    Returns:
        The process's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: dev-browser exited with a non-zero status.
        subprocess.TimeoutExpired: dev-browser did not finish within *timeout*.
        FileNotFoundError: the ``dev-browser`` executable is not on PATH.
    """
    result = subprocess.run(
        ["dev-browser", "--connect"],
        input=script,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if result.returncode != 0:
        # Fall back to stdout so the error is never blank when the tool
        # reports its failure there instead of on stderr.
        detail = result.stderr.strip() or result.stdout.strip() or "no output"
        raise RuntimeError(
            f"dev-browser failed (exit {result.returncode}): {detail}"
        )
    return result.stdout.strip()
def take_screenshot(task: dict, base_url: str, out_dir: Path) -> Path:
    """Run the task's dev-browser setup script and return the screenshot path.

    The setup template is rendered with ``str.format``: ``{base_url}`` is
    substituted and the doubled braces (``{{`` / ``}}``) used around the
    JavaScript object literals collapse to single braces. A plain
    ``str.replace`` would leave ``{{ ... }}`` in the script, which is not
    valid JavaScript.

    Args:
        task: Task entry providing a ``"setup"`` script template.
        base_url: Site root; a trailing slash is dropped so that
            ``{base_url}/app`` never becomes ``...//app``.
        out_dir: Directory the screenshot is copied into.

    Returns:
        Path of the copied screenshot inside *out_dir*.

    Raises:
        RuntimeError: the script produced no ``screenshot:<name>`` line that
            names an existing file in ``DEV_BROWSER_TMP``.
    """
    script = task["setup"].format(base_url=base_url.rstrip("/"))
    output = run_dev_browser(script)
    # The setup script announces its output file via console.log.
    for line in output.splitlines():
        if not line.startswith("screenshot:"):
            continue
        # Keep only the final path component so the copy stays inside out_dir.
        fname = Path(line.split(":", 1)[1].strip()).name
        if not fname:
            continue
        src = DEV_BROWSER_TMP / fname
        if src.is_file():
            dst = out_dir / fname
            dst.write_bytes(src.read_bytes())
            return dst
    raise RuntimeError(f"No screenshot produced. dev-browser output:\n{output}")
def encode_image(path: Path) -> str:
    """Return the bytes of the file at *path* as a base64 text string."""
    return base64.b64encode(path.read_bytes()).decode("utf-8")


def run_task(client, model: str, task: dict, screenshot_path: Path) -> dict:
    """Send a screenshot plus instruction to the model and return a result record.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the OpenAI
            client in this script).
        model: Model ID forwarded to the API.
        task: Task entry providing ``"id"``, ``"instruction"`` and ``"schema"``.
        screenshot_path: PNG file to attach to the request.

    Returns:
        On success: ``{"task_id", "status": "ok", "result", "raw",
        "elapsed_s", "usage"}``. ``result`` is always a dict: the parsed JSON
        object, or ``{"raw": <content>, "parse_error": True}`` when the
        content is not valid JSON or is valid JSON that is not an object.
        On failure: ``{"task_id", "status": "error", "error", "elapsed_s"}``.
        This function never raises; every failure becomes an error record.
    """
    started = time.perf_counter()
    try:
        b64 = encode_image(screenshot_path)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": task["instruction"]},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64}"},
                    },
                ],
            }
        ]
        # Restart the clock so elapsed_s covers only the API round-trip.
        started = time.perf_counter()
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            extra_body={"structured_outputs": {"json": task["schema"]}},
            temperature=0.0,
        )
        elapsed = time.perf_counter() - started
        content = response.choices[0].message.content
        if content is None:
            # json.loads(None) would raise an opaque TypeError; say what happened.
            raise ValueError("model returned no message content")
        usage = {
            "prompt_tokens": response.usage.prompt_tokens if response.usage else None,
            "completion_tokens": response.usage.completion_tokens if response.usage else None,
        }
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None
        if not isinstance(parsed, dict):
            # Callers iterate result.items() / call result.get(); keep it a dict
            # even when the model answers with a bare list, string or number.
            parsed = {"raw": content, "parse_error": True}
        return {
            "task_id": task["id"],
            "status": "ok",
            "result": parsed,
            "raw": content,
            "elapsed_s": round(elapsed, 2),
            "usage": usage,
        }
    except Exception as e:  # boundary: one failing task must not abort the run
        return {
            "task_id": task["id"],
            "status": "error",
            "error": str(e),
            "elapsed_s": round(time.perf_counter() - started, 2),
        }
def check_dev_browser():
    """Verify dev-browser is installed and can connect to Chrome.

    Runs a small probe script through ``dev-browser --connect`` that prints
    the number of open pages. On success a confirmation line is printed and
    the function returns ``None``. On any failure the process exits via
    ``sys.exit`` with a hint on how to fix the setup:

    * executable not found  -> install instructions
    * no answer within 10 s -> remote-debugging hint
    * non-zero exit status  -> stderr plus remote-debugging hint
    """
    probe = 'const tabs = await browser.listPages(); console.log(JSON.stringify(tabs.length));'
    try:
        result = subprocess.run(
            ["dev-browser", "--connect"],
            input=probe,
            capture_output=True, text=True, timeout=10,
        )
    except FileNotFoundError:
        sys.exit("dev-browser not found. Install: npm install -g dev-browser && dev-browser install")
    except subprocess.TimeoutExpired:
        # Without this, a hung connection surfaces as a raw traceback.
        sys.exit(
            "dev-browser --connect timed out after 10s. "
            "Make sure Chrome is running with --remote-debugging-port=9222"
        )
    if result.returncode != 0:
        print(f"⚠️ dev-browser --connect failed: {result.stderr.strip()}")
        print(" Make sure Chrome is running with --remote-debugging-port=9222")
        print(" Or enable remote debugging at chrome://inspect/#remote-debugging")
        sys.exit(1)
    tab_count = result.stdout.strip()
    print(f" ✅ Connected to Chrome ({tab_count} tabs)")
# USD per one million tokens, used only for the rough cost estimate in the
# summary. NOTE(review): values carried over unchanged — confirm against
# current pricing.
_PRICE_IN_PER_MTOK = 0.25
_PRICE_OUT_PER_MTOK = 1.80

# Pause between inference requests. The free tier allows 10 requests per
# minute (one every 6 s); 7 s leaves a small margin.
_RATE_LIMIT_DELAY_S = 7


def _select_tasks(requested):
    """Return the EVAL_TASKS entries to run, in EVAL_TASKS order.

    An empty or missing *requested* selects every task. Raises ValueError
    when *requested* names an ID that does not exist, so a typo cannot
    silently shrink the run.
    """
    if not requested:
        return list(EVAL_TASKS)
    known = {t["id"] for t in EVAL_TASKS}
    unknown = [task_id for task_id in requested if task_id not in known]
    if unknown:
        raise ValueError(
            f"unknown task id(s): {', '.join(unknown)} "
            f"(available: {', '.join(t['id'] for t in EVAL_TASKS)})"
        )
    wanted = set(requested)
    return [t for t in EVAL_TASKS if t["id"] in wanted]


def _capture_screenshots(tasks, base_url, screenshots_dir):
    """Take one screenshot per task; map task ID to its path, or None on failure."""
    screenshot_map = {}
    for task in tasks:
        print(f" 📸 {task['id']}...")
        try:
            screenshot_map[task["id"]] = take_screenshot(task, base_url, screenshots_dir)
            print(f" ✅ {screenshot_map[task['id']]}")
        except Exception as e:  # boundary: keep going with the remaining tasks
            print(f" ❌ {e}")
            screenshot_map[task["id"]] = None
    return screenshot_map


def _run_inference(client, model, tasks, screenshot_map):
    """Run every task that has a screenshot through the model; return result records."""
    results = []
    for i, task in enumerate(tasks, 1):
        ss = screenshot_map.get(task["id"])
        if not ss:
            results.append({"task_id": task["id"], "status": "skipped", "error": "no screenshot"})
            continue
        print(f"\n [{i}/{len(tasks)}] {task['id']}")
        print(f" Instruction: {task['instruction'][:80]}...")
        result = run_task(client, model, task, ss)
        results.append(result)
        if result["status"] == "ok":
            print(f" ✅ {result['elapsed_s']}s | tokens: {result['usage']}")
            payload = result["result"]
            # Tolerate a non-object payload instead of crashing on .items().
            fields = payload.items() if isinstance(payload, dict) else [("result", payload)]
            for k, v in fields:
                val = str(v)
                if len(val) > 100:
                    val = val[:100] + "..."
                print(f" {k}: {val}")
        else:
            print(f" ❌ {result['error']}")
        # Respect free tier rate limit (10 RPM)
        if i < len(tasks):
            time.sleep(_RATE_LIMIT_DELAY_S)
    return results


def _summarize(results):
    """Print the eval summary and return the aggregate numbers as a dict."""
    ok = [r for r in results if r["status"] == "ok"]
    err = [r for r in results if r["status"] in ("error", "skipped")]
    tokens_in = sum((r.get("usage") or {}).get("prompt_tokens") or 0 for r in ok)
    tokens_out = sum((r.get("usage") or {}).get("completion_tokens") or 0 for r in ok)
    est_cost = (tokens_in * _PRICE_IN_PER_MTOK + tokens_out * _PRICE_OUT_PER_MTOK) / 1_000_000

    print("\n" + "=" * 60)
    print("📊 EVAL SUMMARY")
    print("=" * 60)
    print(f" Passed: {len(ok)}/{len(results)}")
    print(f" Failed: {len(err)}/{len(results)}")
    print(f" Tokens: {tokens_in} in / {tokens_out} out")
    print(f" Est cost: ${est_cost:.4f}")
    if ok:
        print(f" Avg latency: {sum(r['elapsed_s'] for r in ok) / len(ok):.1f}s")
        high_conf = [
            r for r in ok
            if isinstance(r["result"], dict) and r["result"].get("confidence") == "high"
        ]
        print(f" High confidence: {len(high_conf)}/{len(ok)}")
    return {
        "passed": len(ok),
        "failed": len(err),
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
        "est_cost_usd": round(est_cost, 6),
    }


def main():
    """CLI entry point: capture screenshots, run the eval tasks, write results.json.

    Phases:
      1. Screenshot every selected task through dev-browser.
      2. Unless --screenshot-only is given, send each screenshot to the model.
      3. Print a summary and write ``<out>/results.json``.

    Exits with a message when the API key is missing (and inference is
    requested), when --tasks names an unknown ID, or when dev-browser cannot
    connect.
    """
    parser = argparse.ArgumentParser(description="Holo3 TAC UI Eval (dev-browser)")
    parser.add_argument("--url", default="https://staging.talkandcomment.com", help="Base URL to test")
    parser.add_argument("--model", default="holo3-35b-a3b", help="Model ID")
    parser.add_argument("--api-key", default=os.environ.get("HAI_API_KEY"), help="H Company API key")
    parser.add_argument("--out", default="./holo3-eval-results", help="Output directory")
    parser.add_argument("--tasks", nargs="*", help="Run specific task IDs only")
    parser.add_argument("--screenshot-only", action="store_true", help="Only take screenshots, skip Holo3 inference")
    args = parser.parse_args()

    if not args.screenshot_only and not args.api_key:
        sys.exit("Set HAI_API_KEY env var or pass --api-key. Get one free at https://portal.hcompany.ai/")
    try:
        tasks = _select_tasks(args.tasks)
    except ValueError as exc:
        parser.error(str(exc))

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    screenshots_dir = out_dir / "screenshots"
    screenshots_dir.mkdir(exist_ok=True)

    print("\n🔍 Holo3 TAC UI Eval (dev-browser)")
    print(f" Target: {args.url}")
    print(f" Model: {args.model}")
    print(f" Tasks: {len(tasks)}")
    print(f" Output: {out_dir}")
    print("\n🔌 Checking dev-browser connection...")
    check_dev_browser()

    # Phase 1: Screenshots via dev-browser --connect
    print("\n📷 Phase 1: Capturing screenshots from live Chrome...")
    screenshot_map = _capture_screenshots(tasks, args.url, screenshots_dir)
    if args.screenshot_only:
        print(f"\n📸 Screenshots saved to {screenshots_dir}/")
        return

    # Phase 2: Run Holo3 inference
    client = OpenAI(base_url="https://api.hcompany.ai/v1/", api_key=args.api_key)
    print("\n🤖 Phase 2: Running Holo3 eval tasks...")
    results = _run_inference(client, args.model, tasks, screenshot_map)

    # Phase 3: Summary
    summary = _summarize(results)

    # Save results
    results_path = out_dir / "results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "meta": {
                    "url": args.url,
                    "model": args.model,
                    "browser": "dev-browser --connect (live Chrome)",
                    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
                    "task_count": len(tasks),
                },
                "summary": summary,
                "tasks": results,
            },
            f,
            indent=2,
        )
    print(f"\n Results saved to {results_path}")
    print(f" Screenshots in {screenshots_dir}/\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment