Skip to content

Instantly share code, notes, and snippets.

@zakelfassi
Last active April 2, 2026 19:55
Show Gist options
  • Select an option

  • Save zakelfassi/7e1f98c5f0ea150ac90ca463fbcc54a3 to your computer and use it in GitHub Desktop.

Select an option

Save zakelfassi/7e1f98c5f0ea150ac90ca463fbcc54a3 to your computer and use it in GitHub Desktop.
Holo3 TAC UI Eval — test computer-use VLM on Talk & Comment surfaces
#!/usr/bin/env python3
"""
Holo3 TAC UI Eval — test a computer-use VLM on Talk & Comment (TAC) surfaces.

Uses dev-browser (https://github.com/SawyerHood/dev-browser) to connect
to your running Chrome with the TAC extension installed, take screenshots
of real extension + SPA surfaces, and send them to Holo3 for evaluation.

Setup:
    npm install -g dev-browser && dev-browser install
    pip install openai
    # Launch Chrome with remote debugging:
    #   Mac: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222
    #   Or enable at chrome://inspect/#remote-debugging
    export HAI_API_KEY="your-key-from-portal.hcompany.ai"

Usage:
    python3 holo3-tac-eval.py [--url https://staging.talkandcomment.com] [--model holo3-35b-a3b]
    python3 holo3-tac-eval.py --tasks locate-record-btn describe-layout
"""
import argparse
import base64
import json
import os
import subprocess
import sys
import time
from pathlib import Path
try:
from openai import OpenAI
except ImportError:
sys.exit("pip install openai")
# --- Config ---
# Each entry describes one evaluation task:
#   id          - identifier used by --tasks to select the task and to label results.
#   setup       - script piped to `dev-browser --connect`; it navigates, takes a
#                 screenshot, and logs a `screenshot:<file>` line that
#                 take_screenshot parses to locate the saved image.
#   instruction - text prompt sent to the model together with the screenshot.
#   schema      - JSON schema passed to the API as the structured-output constraint.
#
# In the setup scripts `{base_url}` is the placeholder filled in by
# take_screenshot. Literal JavaScript braces are written doubled (`{{`, `}}`),
# i.e. with str.format-style escaping, so a script is only valid JavaScript
# once it has been rendered with format-style un-escaping.
EVAL_TASKS = [
    # Locate the control that starts a voice recording on the main app page.
    {
        "id": "locate-record-btn",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-main.png");
console.log("screenshot:app-main.png");
""",
        "instruction": "Find the button or element that starts a voice recording. Return its label text and approximate x,y coordinates.",
        "schema": {
            "type": "object",
            "properties": {
                "element_label": {"type": "string", "description": "The visible text or aria-label of the record button"},
                "x": {"type": "integer", "description": "Approximate x coordinate of the element center"},
                "y": {"type": "integer", "description": "Approximate y coordinate of the element center"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_label", "x", "y", "confidence"],
        },
    },
    # Read the subscription plan badge on the main app page.
    {
        "id": "identify-plan-badge",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-plan.png");
console.log("screenshot:app-plan.png");
""",
        "instruction": "Look at the user interface and identify what subscription plan the user is on. Find the plan badge or label and return the plan name and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "plan_name": {"type": "string", "description": "The subscription plan shown (e.g. Free, Trial, Teacher Pro)"},
                "badge_text": {"type": "string", "description": "Exact text of the badge element"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["plan_name", "confidence"],
        },
    },
    # Find the primary upgrade call-to-action on the pricing page.
    {
        "id": "find-upgrade-cta",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/pricing", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "pricing.png");
console.log("screenshot:pricing.png");
""",
        "instruction": "Find the primary call-to-action button for upgrading to a paid plan. Return the button text, its coordinates, and what plan it promotes.",
        "schema": {
            "type": "object",
            "properties": {
                "cta_text": {"type": "string", "description": "The text on the upgrade button"},
                "promoted_plan": {"type": "string", "description": "Which plan the CTA promotes"},
                "price_shown": {"type": "string", "description": "Any price visible near the CTA"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["cta_text", "confidence"],
        },
    },
    # Free-form description of the main app page layout.
    {
        "id": "describe-layout",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-layout.png");
console.log("screenshot:app-layout.png");
""",
        "instruction": "Describe the overall layout of this web application. What are the main sections visible? Is there a sidebar, header, main content area? List the key UI components you can identify.",
        "schema": {
            "type": "object",
            "properties": {
                "layout_type": {"type": "string", "description": "e.g. sidebar+main, single-column, dashboard-grid"},
                "sections": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of main UI sections visible",
                },
                "key_components": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Notable UI components (buttons, lists, cards, etc.)",
                },
                "overall_purpose": {"type": "string", "description": "What does this app appear to do?"},
            },
            "required": ["sections", "key_components", "overall_purpose"],
        },
    },
    # Detect the extension-injected floating widget; this script waits 3000 ms
    # after load (the other tasks wait 2000 ms) before taking the screenshot.
    {
        "id": "extension-floater",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(3000);
// The TAC floater is injected by the Chrome extension as a shadow DOM element.
// Screenshot the full page — if the extension is loaded, the floater should be visible.
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-with-ext.png");
console.log("screenshot:app-with-ext.png");
""",
        "instruction": "Look for a floating widget or overlay element that appears to be injected by a browser extension (not part of the main page). It may be a small circular button or a recording widget, likely positioned at the edge or corner of the viewport. Describe what you see and its location.",
        "schema": {
            "type": "object",
            "properties": {
                "floater_found": {"type": "boolean", "description": "Whether a floating extension widget was detected"},
                "description": {"type": "string", "description": "What the floater looks like"},
                "position": {"type": "string", "description": "Where on screen (e.g. bottom-right, top-left)"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["floater_found", "confidence"],
        },
    },
    # Find the navigation element leading to the account/settings page.
    {
        "id": "nav-to-settings",
        "setup": """
const page = await browser.getPage("tac-eval");
await page.goto("{base_url}/app", {{ waitUntil: "networkidle", timeout: 15000 }});
await page.waitForTimeout(2000);
const buf = await page.screenshot({{ type: "png" }});
await saveScreenshot(buf, "app-nav.png");
console.log("screenshot:app-nav.png");
""",
        "instruction": "I want to navigate to the account or settings page. What element should I click? Return the exact text/icon and coordinates of the navigation element.",
        "schema": {
            "type": "object",
            "properties": {
                "element_description": {"type": "string"},
                "element_text": {"type": "string"},
                "x": {"type": "integer"},
                "y": {"type": "integer"},
                "confidence": {"type": "string", "enum": ["high", "medium", "low"]},
            },
            "required": ["element_description", "x", "y", "confidence"],
        },
    },
]
# Directory take_screenshot copies screenshots out of; presumably where
# dev-browser's saveScreenshot() writes its files — TODO confirm.
DEV_BROWSER_TMP = Path.home() / ".dev-browser" / "tmp"
def run_dev_browser(script: str) -> str:
    """Pipe *script* to ``dev-browser --connect`` and return its stdout.

    The script is passed on standard input; the returned text is the
    process's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: if dev-browser exits with a non-zero status; the
            message carries the stripped stderr text.
        subprocess.TimeoutExpired: if the process runs longer than 30 seconds.
    """
    command = ["dev-browser", "--connect"]
    completed = subprocess.run(
        command,
        input=script,
        capture_output=True,
        text=True,
        timeout=30,
    )
    if completed.returncode == 0:
        return completed.stdout.strip()
    stderr_text = completed.stderr.strip()
    raise RuntimeError(f"dev-browser failed: {stderr_text}")
def take_screenshot(task: dict, base_url: str, out_dir: Path) -> Path:
    """Run the task's dev-browser setup script and return the screenshot path.

    Args:
        task: An entry of EVAL_TASKS; only its ``"setup"`` template is used.
        base_url: Site root substituted for ``{base_url}`` in the template.
            A trailing ``/`` is dropped so ``{base_url}/app`` never becomes
            ``...//app``.
        out_dir: Existing directory the screenshot is copied into.

    Returns:
        Path of the copied screenshot inside ``out_dir``.

    Raises:
        RuntimeError: if dev-browser fails, or if no ``screenshot:<file>``
            line in its output names a file present in DEV_BROWSER_TMP.
        KeyError, IndexError, ValueError: if the setup template contains
            unescaped braces other than ``{base_url}``.
    """
    # The setup templates escape literal JavaScript braces as `{{` / `}}`,
    # so they must be rendered with str.format. A plain str.replace leaves
    # the doubled braces in place and hands dev-browser invalid JavaScript
    # (e.g. `page.goto(url, {{ waitUntil: ... }})`).
    script = task["setup"].format(base_url=base_url.rstrip("/"))
    output = run_dev_browser(script)

    # The script announces the saved file via console.log("screenshot:<name>").
    for line in output.splitlines():
        if not line.startswith("screenshot:"):
            continue
        fname = line.split(":", 1)[1].strip()
        src = DEV_BROWSER_TMP / fname
        if src.exists():
            dst = out_dir / fname
            dst.write_bytes(src.read_bytes())
            return dst
    raise RuntimeError(f"No screenshot produced. dev-browser output:\n{output}")
def encode_image(path: Path) -> str:
    """Return the contents of the file at *path* as base64 text."""
    raw_bytes = path.read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")
def run_task(client, model: str, task: dict, screenshot_path: Path) -> dict:
    """Send a screenshot + instruction to Holo3 and return a result record.

    Args:
        client: Object exposing ``chat.completions.create`` (the OpenAI
            client built in ``main``).
        model: Model ID passed through to the API.
        task: An entry of EVAL_TASKS; ``"id"``, ``"instruction"`` and
            ``"schema"`` are used.
        screenshot_path: PNG file sent to the model as a base64 data URL.

    Returns:
        On success, a dict with ``task_id``, ``status`` ("ok"), ``result``,
        ``raw``, ``elapsed_s`` and ``usage``. ``result`` is always a dict:
        the decoded JSON object, or ``{"raw": ..., "parse_error": True}``
        when the reply is not a JSON object.
        On any failure of the API call, a dict with ``task_id``, ``status``
        ("error"), ``error`` and ``elapsed_s``.
    """
    b64 = encode_image(screenshot_path)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": task["instruction"]},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64}"},
                },
            ],
        }
    ]
    t0 = time.time()
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            extra_body={"structured_outputs": {"json": task["schema"]}},
            temperature=0.0,
        )
        elapsed = time.time() - t0
        content = response.choices[0].message.content
        if content is None:
            # Report an empty reply explicitly instead of letting json.loads
            # fail with an opaque TypeError.
            raise ValueError("model returned no message content")
        usage = {
            "prompt_tokens": response.usage.prompt_tokens if response.usage else None,
            "completion_tokens": response.usage.completion_tokens if response.usage else None,
        }
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None
        # Callers iterate `result` as a mapping, so valid JSON that is not an
        # object (a list, string or number) is reported as a parse error too.
        if not isinstance(parsed, dict):
            parsed = {"raw": content, "parse_error": True}
        return {
            "task_id": task["id"],
            "status": "ok",
            "result": parsed,
            "raw": content,
            "elapsed_s": round(elapsed, 2),
            "usage": usage,
        }
    except Exception as e:
        # Top-level boundary for a single task: record the failure so the
        # remaining tasks still run.
        return {
            "task_id": task["id"],
            "status": "error",
            "error": str(e),
            "elapsed_s": round(time.time() - t0, 2),
        }
def check_dev_browser():
    """Verify dev-browser is installed and can connect to Chrome.

    Runs a short probe script through ``dev-browser --connect`` that prints
    the number of open pages. On success the tab count is printed and the
    function returns None.

    Exits the process via ``sys.exit`` (non-zero) when dev-browser is not
    installed, when the probe does not finish within 10 seconds, or when
    dev-browser returns a non-zero status.
    """
    probe = 'const tabs = await browser.listPages(); console.log(JSON.stringify(tabs.length));'
    # Keep the try body to the one call that can raise.
    try:
        result = subprocess.run(
            ["dev-browser", "--connect"],
            input=probe,
            capture_output=True, text=True, timeout=10,
        )
    except FileNotFoundError:
        sys.exit("dev-browser not found. Install: npm install -g dev-browser && dev-browser install")
    except subprocess.TimeoutExpired:
        # A hung connection should give the same actionable hint as a failed
        # one, not a traceback.
        sys.exit(
            "dev-browser --connect timed out after 10s. "
            "Make sure Chrome is running with --remote-debugging-port=9222"
        )
    if result.returncode != 0:
        print(f"⚠️ dev-browser --connect failed: {result.stderr.strip()}")
        print(" Make sure Chrome is running with --remote-debugging-port=9222")
        print(" Or enable remote debugging at chrome://inspect/#remote-debugging")
        sys.exit(1)
    tab_count = result.stdout.strip()
    print(f" ✅ Connected to Chrome ({tab_count} tabs)")
def main():
    """Run the eval end to end: capture screenshots, query the model, summarize.

    Phases:
      1. Capture one screenshot per selected task through dev-browser.
      2. Send each screenshot and its instruction to the model (skipped
         entirely with ``--screenshot-only``).
      3. Print a summary and write ``results.json`` into the output directory.

    Exits via ``sys.exit`` when no API key is available (unless
    ``--screenshot-only`` is given) and via ``parser.error`` when ``--tasks``
    matches no known task id.
    """
    parser = argparse.ArgumentParser(description="Holo3 TAC UI Eval (dev-browser)")
    parser.add_argument("--url", default="https://staging.talkandcomment.com", help="Base URL to test")
    parser.add_argument("--model", default="holo3-35b-a3b", help="Model ID")
    parser.add_argument("--api-key", default=os.environ.get("HAI_API_KEY"), help="H Company API key")
    parser.add_argument("--out", default="./holo3-eval-results", help="Output directory")
    parser.add_argument("--tasks", nargs="*", help="Run specific task IDs only")
    parser.add_argument("--screenshot-only", action="store_true", help="Only take screenshots, skip Holo3 inference")
    args = parser.parse_args()
    if not args.screenshot_only and not args.api_key:
        sys.exit("Set HAI_API_KEY env var or pass --api-key. Get one free at https://portal.hcompany.ai/")

    # Resolve the task selection before touching the filesystem, so a
    # mistyped id is reported instead of silently running fewer (or zero)
    # tasks.
    tasks = EVAL_TASKS
    if args.tasks:
        known_ids = [t["id"] for t in EVAL_TASKS]
        unknown_ids = [tid for tid in args.tasks if tid not in known_ids]
        if unknown_ids:
            print(f" ⚠️ Ignoring unknown task id(s): {', '.join(unknown_ids)}")
        tasks = [t for t in EVAL_TASKS if t["id"] in args.tasks]
        if not tasks:
            parser.error("no matching tasks; valid ids: " + ", ".join(known_ids))

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    screenshots_dir = out_dir / "screenshots"
    screenshots_dir.mkdir(exist_ok=True)

    print("\n🔍 Holo3 TAC UI Eval (dev-browser)")
    print(f" Target: {args.url}")
    print(f" Model: {args.model}")
    print(f" Tasks: {len(tasks)}")
    print(f" Output: {out_dir}")
    print("\n🔌 Checking dev-browser connection...")
    check_dev_browser()

    # Phase 1: Screenshots via dev-browser --connect
    print("\n📷 Phase 1: Capturing screenshots from live Chrome...")
    screenshot_map = {}
    for task in tasks:
        print(f" 📸 {task['id']}...")
        try:
            screenshot_map[task["id"]] = take_screenshot(task, args.url, screenshots_dir)
            print(f" ✅ {screenshot_map[task['id']]}")
        except Exception as e:
            # One failed capture must not abort the run; the task is
            # reported as skipped in phase 2.
            print(f" ❌ {e}")
            screenshot_map[task["id"]] = None
    if args.screenshot_only:
        print(f"\n📸 Screenshots saved to {screenshots_dir}/")
        return

    # Phase 2: Run Holo3 inference
    client = OpenAI(base_url="https://api.hcompany.ai/v1/", api_key=args.api_key)
    print("\n🤖 Phase 2: Running Holo3 eval tasks...")
    results = []
    request_sent = False
    for i, task in enumerate(tasks, 1):
        ss = screenshot_map.get(task["id"])
        if not ss:
            results.append({"task_id": task["id"], "status": "skipped", "error": "no screenshot"})
            continue
        # Respect free tier rate limit (10 RPM): pause only between two
        # consecutive requests, so skipped tasks and the final request never
        # cost a sleep.
        if request_sent:
            time.sleep(7)
        request_sent = True
        print(f"\n [{i}/{len(tasks)}] {task['id']}")
        print(f" Instruction: {task['instruction'][:80]}...")
        result = run_task(client, args.model, task, ss)
        results.append(result)
        if result["status"] == "ok":
            print(f" ✅ {result['elapsed_s']}s | tokens: {result['usage']}")
            fields = result["result"]
            if not isinstance(fields, dict):
                # The model may return valid JSON that is not an object.
                fields = {"raw": fields}
            for k, v in fields.items():
                val = str(v)
                if len(val) > 100:
                    val = val[:100] + "..."
                print(f" {k}: {val}")
        else:
            print(f" ❌ {result['error']}")

    # Phase 3: Summary
    print("\n" + "=" * 60)
    print("📊 EVAL SUMMARY")
    print("=" * 60)
    ok = [r for r in results if r["status"] == "ok"]
    err = [r for r in results if r["status"] in ("error", "skipped")]
    total_tokens_in = sum(r.get("usage", {}).get("prompt_tokens") or 0 for r in ok)
    total_tokens_out = sum(r.get("usage", {}).get("completion_tokens") or 0 for r in ok)
    # Rates are applied per one million tokens (input 0.25, output 1.80).
    est_cost = (total_tokens_in * 0.25 + total_tokens_out * 1.80) / 1_000_000
    print(f" Passed: {len(ok)}/{len(results)}")
    print(f" Failed: {len(err)}/{len(results)}")
    print(f" Tokens: {total_tokens_in} in / {total_tokens_out} out")
    print(f" Est cost: ${est_cost:.4f}")
    if ok:
        print(f" Avg latency: {sum(r['elapsed_s'] for r in ok) / len(ok):.1f}s")
        high_conf = [
            r for r in ok
            if isinstance(r["result"], dict) and r["result"].get("confidence") == "high"
        ]
        print(f" High confidence: {len(high_conf)}/{len(ok)}")

    # Save results
    results_path = out_dir / "results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "meta": {
                    "url": args.url,
                    "model": args.model,
                    "browser": "dev-browser --connect (live Chrome)",
                    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
                    "task_count": len(tasks),
                },
                "summary": {
                    "passed": len(ok),
                    "failed": len(err),
                    "tokens_in": total_tokens_in,
                    "tokens_out": total_tokens_out,
                    "est_cost_usd": round(est_cost, 6),
                },
                "tasks": results,
            },
            f,
            indent=2,
        )
    print(f"\n Results saved to {results_path}")
    print(f" Screenshots in {screenshots_dir}/\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment