omayib · June 6, 2026 03:01
diff --git a/skill_benchmark.py b/skill_benchmark.py
 import time
 import json
 from datetime import datetime, timedelta
 import ollama

 # 1. ARRANGE: Define the full core skill environment
 # GOOGLE_CALENDAR_SKILL is the skill definition (front matter, rules, usage
 # examples) that run_unified_benchmark sends verbatim as the system message.
 # NOTE(review): the rule list inside the prompt is numbered 1-4 and then 6;
 # confirm whether a rule 5 was dropped or the numbering is just off.
 # NOTE(review): the preflight example spells the script as
 # `python3 .../google_api.py` instead of using $GAPI like the create example.
 GOOGLE_CALENDAR_SKILL = """
 You are an AI agent executing a Hermes skill. You must execute the correct CLI commands based on the rules below.

 ---
 name: google-calendar-create
 description: "Add a new Google Calendar event via the existing Google Workspace skill."
 ---
 ## Rules
 1. Confirm before creating with the user when possible. Show event fields before executing the command.
 2. Use ISO 8601 with timezone offset for all start/end values. Example for WIB: 2026-01-18T09:00:00+07:00
 3. If the time is all day, omit the time and offset: 2026-01-18.
 4. Use --attendees only if the user provides an email list. Otherwise omit.
 6. Avoid duplicate events: before creating, list events for the same date/time window and only create if the event is not already present.

 ## Usage
 GAPI="python $HOME/.hermes/skills/productivity/google-workspace/scripts/google_api.py"

 Create an event:
 $GAPI calendar create --summary "Standup" --start 2026-01-18T09:00:00+07:00 --end 2026-01-18T09:30:00+07:00

 Preflight check before creating events (Avoid duplicates by listing first):
 python3 .../google_api.py calendar list --start 2026-06-06T00:00:00+07:00 --end 2026-06-06T23:59:59+07:00

 Output format expected for execution: Output the exact bash command to run when requested.
 """

 # Reference clock, read once when the module is loaded. datetime.now() without
 # a tz argument yields a naive local-time value (no UTC offset attached).
 CURRENT_TIME = datetime.now() 
 # The same instant shifted forward by exactly one day.
 TOMORROW = CURRENT_TIME + timedelta(days=1)
 # Calendar date of TOMORROW formatted as YYYY-MM-DD.
 TOMORROW_DATE_STR = TOMORROW.strftime("%Y-%m-%d")

 def run_unified_benchmark(model_name):
    """Run the three-phase calendar-skill benchmark against one Ollama model.

    All phases share one conversation seeded with GOOGLE_CALENDAR_SKILL as the
    system message:

    1. Planning probe: the model lists applicable constraints and its plan.
       The prompt is prefixed with the current local timestamp so that
       "tomorrow" in the task can actually be resolved.
    2. Preflight: the model emits the duplicate-check (list) command.
    3. Final action: a mocked empty listing (``[]``) is fed back and the model
       emits the create command.

    Each reply is printed with its latency and token counts, followed by the
    aggregated totals.

    Args:
        model_name: Ollama model tag passed to ``ollama.chat``.

    Returns:
        dict with ``"model"`` (the tag), ``"tps"`` (output tokens divided by
        the summed request latency, 0.0 when no time was measured) and
        ``"tokens"`` (input plus output tokens over all phases).

    Exceptions raised by ``ollama.chat`` are not caught here and propagate to
    the caller.
    """
    print("\n=============================================================")
    print(f"🔥 UNIFIED SKILL BENCHMARK: {model_name}")
    print(f"📅 Current System Time Context: {CURRENT_TIME.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=============================================================")

    conversation = [{"role": "system", "content": GOOGLE_CALENDAR_SKILL}]
    # Running totals over all phases.
    totals = {"in": 0, "out": 0, "time": 0.0}

    def run_phase(banner, user_prompt):
        """Send one user turn, print the reply with its metrics, return the reply text."""
        conversation.append({"role": "user", "content": user_prompt})

        # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval.
        started = time.perf_counter()
        response = ollama.chat(model=model_name, messages=conversation, options={"temperature": 0.0})
        elapsed = time.perf_counter() - started

        # A counter can be missing or None in the response (presumably when the
        # server omits it); `or 0` keeps the sums and the printout numeric.
        tokens_in = response.get('prompt_eval_count') or 0
        tokens_out = response.get('eval_count') or 0
        reply = response['message']['content']

        # Keep the reply in the history so the following phase builds on it.
        conversation.append({"role": "assistant", "content": reply})

        print(banner)
        print(reply.strip())
        print(f"⏱️  Latency: {elapsed:.2f}s | 🪙  Tokens: {tokens_in} in -> {tokens_out} out\n")

        totals["in"] += tokens_in
        totals["out"] += tokens_out
        totals["time"] += elapsed
        return reply

    # --- PHASE 1: PROBE TASK REPRESENTATION & INTERNAL PLANNING ---
    # A naive CURRENT_TIME is interpreted as local time by astimezone(), which
    # gives the ISO 8601 form with offset that the skill rules ask for.
    now_iso = CURRENT_TIME.astimezone().isoformat(timespec="seconds")
    p1_prompt = (
        f"Current local date and time: {now_iso}\n"
        "Task: 'add meeting by 6 pm tomorrow'.\n\n"
        "Before writing any bash scripts, analyze this task. "
        "List the strict constraints from your instructions that apply, "
        "and explain your planned execution sequence."
    )
    run_phase("🔍 [PHASE 1: Planning Probe Results]:", p1_prompt)

    # --- PHASE 2: GENERATE PREFLIGHT CHECK ---
    p2_prompt = "Excellent analysis. Now, generate the first exact bash command required to perform your preflight duplicate check."
    run_phase("🛠️  [PHASE 2: Preflight Command Generation]:", p2_prompt)

    # --- INTERCEPT & INJECT MOCK DATA ---
    # Nothing is executed; an empty event list is reported back to the model.
    mock_terminal_output = "[]"
    print("🌐 [MOCK TERMINAL EXECUTION]:")
    print(f"    Executing command generated above... returned: {mock_terminal_output} (0 events found)")

    # --- PHASE 3: GENERATE FINAL ACTION ---
    p3_prompt = f"The list command executed successfully and returned: {mock_terminal_output}. Generate the final bash command to create the meeting."
    run_phase("🏁 [PHASE 3: Final Execution Generation]:", p3_prompt)

    # --- COMPUTE COMPREHENSIVE PERFORMANCE AGGREGATIONS ---
    total_input = totals["in"]
    total_output = totals["out"]
    total_time = totals["time"]
    # Guard the division: a zero elapsed total must not crash the run.
    avg_tps = total_output / total_time if total_time > 0 else 0.0

    print(f"📊 --- {model_name} METRIC AND EFFICIENCY ACCOUNTING ---")
    print(f"   📥 Total Input Context Evaluated: {total_input} tokens")
    print(f"   📤 Total Output Logic Generated:  {total_output} tokens")
    print(f"   ⏳ Combined Execution Latency:  {total_time:.2f} seconds")
    print(f"   ⚡ Measured Task Velocity:       {avg_tps:.2f} tokens/s")
    print("=============================================================\n")

    return {"model": model_name, "tps": avg_tps, "tokens": total_input + total_output}

 if __name__ == "__main__":
    # Model tags to benchmark, run one after another.
    test_models = ['llama3.1:latest','qwen3.5:latest','gemma4:e4b','llama3.1-tools:latest','qwen3.5-tools:latest','gemma4-tools:latest']
    summary_results = []

    for model in test_models:
        # A failure for one model is reported and the loop moves on to the next.
        try:
            summary_results.append(run_unified_benchmark(model))
        except Exception as e:
            print(f"❌ Failed executing unified run for {model}: {e}")

    # Show the collected per-model results side by side, fastest first;
    # without this the list is built and then discarded.
    if summary_results:
        print("🏆 SUMMARY (sorted by tokens/s)")
        for result in sorted(summary_results, key=lambda r: r["tps"], reverse=True):
            print(f"   {result['model']:<28} {result['tps']:>8.2f} tokens/s | {result['tokens']} total tokens")
	import time
	import json
	from datetime import datetime, timedelta
	import ollama

	# 1. ARRANGE: Define the full core skill environment
	# GOOGLE_CALENDAR_SKILL is the skill definition (front matter, rules, usage
	# examples) that run_unified_benchmark sends verbatim as the system message.
	# NOTE(review): the rule list inside the prompt is numbered 1-4 and then 6;
	# confirm whether a rule 5 was dropped or the numbering is just off.
	GOOGLE_CALENDAR_SKILL = """
	You are an AI agent executing a Hermes skill. You must execute the correct CLI commands based on the rules below.

	---
	name: google-calendar-create
	description: "Add a new Google Calendar event via the existing Google Workspace skill."
	---
	## Rules
	1. Confirm before creating with the user when possible. Show event fields before executing the command.
	2. Use ISO 8601 with timezone offset for all start/end values. Example for WIB: 2026-01-18T09:00:00+07:00
	3. If the time is all day, omit the time and offset: 2026-01-18.
	4. Use --attendees only if the user provides an email list. Otherwise omit.
	6. Avoid duplicate events: before creating, list events for the same date/time window and only create if the event is not already present.

	## Usage
	GAPI="python $HOME/.hermes/skills/productivity/google-workspace/scripts/google_api.py"

	Create an event:
	$GAPI calendar create --summary "Standup" --start 2026-01-18T09:00:00+07:00 --end 2026-01-18T09:30:00+07:00

	Preflight check before creating events (Avoid duplicates by listing first):
	python3 .../google_api.py calendar list --start 2026-06-06T00:00:00+07:00 --end 2026-06-06T23:59:59+07:00

	Output format expected for execution: Output the exact bash command to run when requested.
	"""

	# Reference clock, read once when the module is loaded. datetime.now() without
	# a tz argument yields a naive local-time value (no UTC offset attached).
	CURRENT_TIME = datetime.now()
	# The same instant shifted forward by exactly one day.
	TOMORROW = CURRENT_TIME + timedelta(days=1)
	# Calendar date of TOMORROW formatted as YYYY-MM-DD.
	TOMORROW_DATE_STR = TOMORROW.strftime("%Y-%m-%d")

	def run_unified_benchmark(model_name):
	    """Run the three-phase calendar-skill benchmark against one Ollama model.

	    All phases share one conversation seeded with GOOGLE_CALENDAR_SKILL as the
	    system message:

	    1. Planning probe: the model lists applicable constraints and its plan.
	       The prompt is prefixed with the current local timestamp so that
	       "tomorrow" in the task can actually be resolved.
	    2. Preflight: the model emits the duplicate-check (list) command.
	    3. Final action: a mocked empty listing (``[]``) is fed back and the model
	       emits the create command.

	    Each reply is printed with its latency and token counts, followed by the
	    aggregated totals.

	    Args:
	        model_name: Ollama model tag passed to ``ollama.chat``.

	    Returns:
	        dict with ``"model"`` (the tag), ``"tps"`` (output tokens divided by
	        the summed request latency, 0.0 when no time was measured) and
	        ``"tokens"`` (input plus output tokens over all phases).

	    Exceptions raised by ``ollama.chat`` are not caught here and propagate to
	    the caller.
	    """
	    print("\n=============================================================")
	    print(f"🔥 UNIFIED SKILL BENCHMARK: {model_name}")
	    print(f"📅 Current System Time Context: {CURRENT_TIME.strftime('%Y-%m-%d %H:%M:%S')}")
	    print("=============================================================")

	    conversation = [{"role": "system", "content": GOOGLE_CALENDAR_SKILL}]
	    # Running totals over all phases.
	    totals = {"in": 0, "out": 0, "time": 0.0}

	    def run_phase(banner, user_prompt):
	        """Send one user turn, print the reply with its metrics, return the reply text."""
	        conversation.append({"role": "user", "content": user_prompt})

	        # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval.
	        started = time.perf_counter()
	        response = ollama.chat(model=model_name, messages=conversation, options={"temperature": 0.0})
	        elapsed = time.perf_counter() - started

	        # A counter can be missing or None in the response (presumably when the
	        # server omits it); `or 0` keeps the sums and the printout numeric.
	        tokens_in = response.get('prompt_eval_count') or 0
	        tokens_out = response.get('eval_count') or 0
	        reply = response['message']['content']

	        # Keep the reply in the history so the following phase builds on it.
	        conversation.append({"role": "assistant", "content": reply})

	        print(banner)
	        print(reply.strip())
	        # Plain "|" separator: "\|" is not a valid escape and would print a stray backslash.
	        print(f"⏱️ Latency: {elapsed:.2f}s | 🪙 Tokens: {tokens_in} in -> {tokens_out} out\n")

	        totals["in"] += tokens_in
	        totals["out"] += tokens_out
	        totals["time"] += elapsed
	        return reply

	    # --- PHASE 1: PROBE TASK REPRESENTATION & INTERNAL PLANNING ---
	    # A naive CURRENT_TIME is interpreted as local time by astimezone(), which
	    # gives the ISO 8601 form with offset that the skill rules ask for.
	    now_iso = CURRENT_TIME.astimezone().isoformat(timespec="seconds")
	    p1_prompt = (
	        f"Current local date and time: {now_iso}\n"
	        "Task: 'add meeting by 6 pm tomorrow'.\n\n"
	        "Before writing any bash scripts, analyze this task. "
	        "List the strict constraints from your instructions that apply, "
	        "and explain your planned execution sequence."
	    )
	    run_phase("🔍 [PHASE 1: Planning Probe Results]:", p1_prompt)

	    # --- PHASE 2: GENERATE PREFLIGHT CHECK ---
	    p2_prompt = "Excellent analysis. Now, generate the first exact bash command required to perform your preflight duplicate check."
	    run_phase("🛠️ [PHASE 2: Preflight Command Generation]:", p2_prompt)

	    # --- INTERCEPT & INJECT MOCK DATA ---
	    # Nothing is executed; an empty event list is reported back to the model.
	    mock_terminal_output = "[]"
	    print("🌐 [MOCK TERMINAL EXECUTION]:")
	    print(f" Executing command generated above... returned: {mock_terminal_output} (0 events found)")

	    # --- PHASE 3: GENERATE FINAL ACTION ---
	    p3_prompt = f"The list command executed successfully and returned: {mock_terminal_output}. Generate the final bash command to create the meeting."
	    run_phase("🏁 [PHASE 3: Final Execution Generation]:", p3_prompt)

	    # --- COMPUTE COMPREHENSIVE PERFORMANCE AGGREGATIONS ---
	    total_input = totals["in"]
	    total_output = totals["out"]
	    total_time = totals["time"]
	    # Guard the division: a zero elapsed total must not crash the run.
	    avg_tps = total_output / total_time if total_time > 0 else 0.0

	    print(f"📊 --- {model_name} METRIC AND EFFICIENCY ACCOUNTING ---")
	    print(f" 📥 Total Input Context Evaluated: {total_input} tokens")
	    print(f" 📤 Total Output Logic Generated: {total_output} tokens")
	    print(f" ⏳ Combined Execution Latency: {total_time:.2f} seconds")
	    print(f" ⚡ Measured Task Velocity: {avg_tps:.2f} tokens/s")
	    print("=============================================================\n")

	    return {"model": model_name, "tps": avg_tps, "tokens": total_input + total_output}

	if __name__ == "__main__":
	    # Model tags to benchmark, run one after another.
	    test_models = ['llama3.1:latest','qwen3.5:latest','gemma4:e4b','llama3.1-tools:latest','qwen3.5-tools:latest','gemma4-tools:latest']
	    summary_results = []

	    for model in test_models:
	        # A failure for one model is reported and the loop moves on to the next.
	        try:
	            summary_results.append(run_unified_benchmark(model))
	        except Exception as e:
	            print(f"❌ Failed executing unified run for {model}: {e}")

	    # Show the collected per-model results side by side, fastest first;
	    # without this the list is built and then discarded.
	    if summary_results:
	        print("🏆 SUMMARY (sorted by tokens/s)")
	        for result in sorted(summary_results, key=lambda r: r["tps"], reverse=True):
	            print(f"   {result['model']:<28} {result['tps']:>8.2f} tokens/s | {result['tokens']} total tokens")
No results found