Skip to content

Instantly share code, notes, and snippets.

@omayib
Created June 6, 2026 03:01
Show Gist options
  • Select an option

  • Save omayib/c0161baa90e3b2d7bc0ccd249dff9eaf to your computer and use it in GitHub Desktop.

Select an option

Save omayib/c0161baa90e3b2d7bc0ccd249dff9eaf to your computer and use it in GitHub Desktop.
import time
import json
from datetime import datetime, timedelta
import ollama
# 1. ARRANGE: the skill definition that is handed to every model as its
# system prompt. The text is part of the benchmark input, so it is kept
# verbatim.
GOOGLE_CALENDAR_SKILL = """
You are an AI agent executing a Hermes skill. You must execute the correct CLI commands based on the rules below.
---
name: google-calendar-create
description: "Add a new Google Calendar event via the existing Google Workspace skill."
---
## Rules
1. Confirm before creating with the user when possible. Show event fields before executing the command.
2. Use ISO 8601 with timezone offset for all start/end values. Example for WIB: 2026-01-18T09:00:00+07:00
3. If the time is all day, omit the time and offset: 2026-01-18.
4. Use --attendees only if the user provides an email list. Otherwise omit.
6. Avoid duplicate events: before creating, list events for the same date/time window and only create if the event is not already present.
## Usage
GAPI="python $HOME/.hermes/skills/productivity/google-workspace/scripts/google_api.py"
Create an event:
$GAPI calendar create --summary "Standup" --start 2026-01-18T09:00:00+07:00 --end 2026-01-18T09:30:00+07:00
Preflight check before creating events (Avoid duplicates by listing first):
python3 .../google_api.py calendar list --start 2026-06-06T00:00:00+07:00 --end 2026-06-06T23:59:59+07:00
Output format expected for execution: Output the exact bash command to run when requested.
"""

# Date context, captured once when the module is imported so that every
# model run in the same process sees the same values. `datetime.now()`
# without a tz argument yields a naive local timestamp.
CURRENT_TIME = datetime.now()
TOMORROW = CURRENT_TIME + timedelta(days=1)
# Calendar date of TOMORROW as YYYY-MM-DD.
TOMORROW_DATE_STR = TOMORROW.date().isoformat()
def _timed_chat(model_name, conversation):
    """Send ``conversation`` to ``model_name`` and time the round trip.

    Returns:
        A ``(content, elapsed_seconds, prompt_tokens, output_tokens)`` tuple,
        where ``content`` is the assistant message text.
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed
    # by a system clock adjustment while the request is in flight.
    started = time.perf_counter()
    response = ollama.chat(
        model=model_name,
        messages=conversation,
        options={"temperature": 0.0},
    )
    elapsed = time.perf_counter() - started
    # `or 0` covers both a missing counter and one reported as None, so the
    # totals computed by the caller never hit `int + None`.
    prompt_tokens = response.get("prompt_eval_count", 0) or 0
    output_tokens = response.get("eval_count", 0) or 0
    return response["message"]["content"], elapsed, prompt_tokens, output_tokens


def _run_phase(model_name, conversation, user_prompt, heading):
    """Run one benchmark turn: ask, time the reply, print it, record it.

    ``conversation`` is extended in place with the user prompt and the
    assistant reply so the next phase sees the full history.

    Returns:
        An ``(elapsed_seconds, prompt_tokens, output_tokens)`` tuple.
    """
    conversation.append({"role": "user", "content": user_prompt})
    content, elapsed, tokens_in, tokens_out = _timed_chat(model_name, conversation)
    print(heading)
    print(content.strip())
    print(f"⏱️ Latency: {elapsed:.2f}s | 🪙 Tokens: {tokens_in} in -> {tokens_out} out\n")
    conversation.append({"role": "assistant", "content": content})
    return elapsed, tokens_in, tokens_out


def run_unified_benchmark(model_name):
    """Run the three-phase calendar-skill benchmark against one model.

    The phases share one conversation seeded with ``GOOGLE_CALENDAR_SKILL``
    as the system prompt:

    1. planning probe (constraints and execution sequence),
    2. generation of the preflight duplicate-check command,
    3. generation of the final create command, after a mocked empty
       result (``[]``) is reported for the preflight check.

    Per-phase latency and token counts are printed as they arrive, followed
    by an aggregate section.

    Args:
        model_name: Model identifier passed to ``ollama.chat``.

    Returns:
        A dict with ``"model"`` (the name), ``"tps"`` (output tokens divided
        by the combined wall-clock latency, ``0.0`` when no time elapsed)
        and ``"tokens"`` (total input plus output tokens).

    Raises:
        Any exception raised by ``ollama.chat`` propagates to the caller.
    """
    banner = "============================================================="
    print(f"\n{banner}")
    print(f"🔥 UNIFIED SKILL BENCHMARK: {model_name}")
    # NOTE(review): this timestamp is only printed; none of the prompts
    # below pass it to the model, so the model has no reference date for
    # "tomorrow". Confirm whether that is intentional.
    print(f"📅 Current System Time Context: {CURRENT_TIME.strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    conversation = [{"role": "system", "content": GOOGLE_CALENDAR_SKILL}]

    # --- PHASE 1: PROBE TASK REPRESENTATION & INTERNAL PLANNING ---
    planning_prompt = (
        "Task: 'add meeting by 6 pm tomorrow'.\n\n"
        "Before writing any bash scripts, analyze this task. "
        "List the strict constraints from your instructions that apply, "
        "and explain your planned execution sequence."
    )
    phase_1 = _run_phase(
        model_name,
        conversation,
        planning_prompt,
        "🔍 [PHASE 1: Planning Probe Results]:",
    )

    # --- PHASE 2: GENERATE PREFLIGHT CHECK ---
    phase_2 = _run_phase(
        model_name,
        conversation,
        "Excellent analysis. Now, generate the first exact bash command required to perform your preflight duplicate check.",
        "🛠️ [PHASE 2: Preflight Command Generation]:",
    )

    # --- INTERCEPT & INJECT MOCK DATA ---
    # Nothing is executed: the preflight result is hard-coded as an empty list.
    mock_terminal_output = "[]"
    print("🌐 [MOCK TERMINAL EXECUTION]:")
    print(f" Executing command generated above... returned: {mock_terminal_output} (0 events found)")

    # --- PHASE 3: GENERATE FINAL ACTION ---
    phase_3 = _run_phase(
        model_name,
        conversation,
        f"The list command executed successfully and returned: {mock_terminal_output}. Generate the final bash command to create the meeting.",
        "🏁 [PHASE 3: Final Execution Generation]:",
    )

    # --- COMPUTE COMPREHENSIVE PERFORMANCE AGGREGATIONS ---
    phases = (phase_1, phase_2, phase_3)
    total_time = sum(elapsed for elapsed, _, _ in phases)
    total_input = sum(tokens_in for _, tokens_in, _ in phases)
    total_output = sum(tokens_out for _, _, tokens_out in phases)
    # Guard the division so a zero elapsed time reports 0 tokens/s instead
    # of raising ZeroDivisionError after all three phases already ran.
    avg_tps = total_output / total_time if total_time > 0 else 0.0

    print(f"📊 --- {model_name} METRIC AND EFFICIENCY ACCOUNTING ---")
    print(f" 📥 Total Input Context Evaluated: {total_input} tokens")
    print(f" 📤 Total Output Logic Generated: {total_output} tokens")
    print(f" ⏳ Combined Execution Latency: {total_time:.2f} seconds")
    print(f" ⚡ Measured Task Velocity: {avg_tps:.2f} tokens/s")
    print(f"{banner}\n")
    return {"model": model_name, "tps": avg_tps, "tokens": total_input + total_output}
def main():
    """Benchmark every configured model and print a ranked summary.

    A failure for one model is reported and the sweep continues with the
    next one. Models that completed are listed at the end, fastest first.
    """
    test_models = [
        'llama3.1:latest',
        'qwen3.5:latest',
        'gemma4:e4b',
        'llama3.1-tools:latest',
        'qwen3.5-tools:latest',
        'gemma4-tools:latest',
    ]
    summary_results = []
    for model in test_models:
        try:
            summary_results.append(run_unified_benchmark(model))
        except Exception as e:
            # Top-level boundary: one failing model must not abort the
            # remaining runs, so the error is reported and skipped.
            print(f"❌ Failed executing unified run for {model}: {e}")

    # The per-model results were previously collected and then dropped;
    # report them so the runs can be compared side by side.
    if summary_results:
        print("--- BENCHMARK SUMMARY (fastest first) ---")
        ranked = sorted(summary_results, key=lambda result: result["tps"], reverse=True)
        for result in ranked:
            print(f" {result['model']}: {result['tps']:.2f} tokens/s | {result['tokens']} total tokens")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment