Created
June 6, 2026 03:01
-
-
Save omayib/c0161baa90e3b2d7bc0ccd249dff9eaf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import time | |
| import json | |
| from datetime import datetime, timedelta | |
| import ollama | |
# 1. ARRANGE: Define the full core skill environment.
# This text is used verbatim as the system message of every conversation
# started by run_unified_benchmark, so any edit here changes what is measured.
# NOTE(review): the rule list is numbered 1-4 and then 6 (there is no rule 5);
# confirm whether a rule was dropped before renumbering.
GOOGLE_CALENDAR_SKILL = """
You are an AI agent executing a Hermes skill. You must execute the correct CLI commands based on the rules below.
---
name: google-calendar-create
description: "Add a new Google Calendar event via the existing Google Workspace skill."
---
## Rules
1. Confirm before creating with the user when possible. Show event fields before executing the command.
2. Use ISO 8601 with timezone offset for all start/end values. Example for WIB: 2026-01-18T09:00:00+07:00
3. If the time is all day, omit the time and offset: 2026-01-18.
4. Use --attendees only if the user provides an email list. Otherwise omit.
6. Avoid duplicate events: before creating, list events for the same date/time window and only create if the event is not already present.
## Usage
GAPI="python $HOME/.hermes/skills/productivity/google-workspace/scripts/google_api.py"
Create an event:
$GAPI calendar create --summary "Standup" --start 2026-01-18T09:00:00+07:00 --end 2026-01-18T09:30:00+07:00
Preflight check before creating events (Avoid duplicates by listing first):
python3 .../google_api.py calendar list --start 2026-06-06T00:00:00+07:00 --end 2026-06-06T23:59:59+07:00
Output format expected for execution: Output the exact bash command to run when requested.
"""
# Snapshot of the local (naive) clock taken once at import time; the benchmark
# banner prints this value.
CURRENT_TIME = datetime.now()

# Same wall-clock time one calendar day later, plus its YYYY-MM-DD rendering.
# NOTE(review): TOMORROW_DATE_STR is not referenced anywhere in the visible
# part of this file -- confirm whether it was meant to be fed into a prompt.
TOMORROW = CURRENT_TIME + timedelta(hours=24)
TOMORROW_DATE_STR = TOMORROW.date().isoformat()
def _timed_chat(model_name, conversation):
    """Send *conversation* to the model once and time the call.

    Returns a tuple ``(content, prompt_tokens, output_tokens, elapsed_seconds)``.
    """
    # perf_counter is monotonic, so the interval cannot go negative or jump
    # if the system clock is adjusted while the model is generating.
    started = time.perf_counter()
    response = ollama.chat(
        model=model_name,
        messages=conversation,
        options={"temperature": 0.0},
    )
    elapsed = time.perf_counter() - started
    # A default of 0 in .get() only covers a *missing* key. If the counter is
    # present but None, `or 0` keeps the later integer sums from raising
    # TypeError.
    prompt_tokens = response.get("prompt_eval_count") or 0
    output_tokens = response.get("eval_count") or 0
    return response["message"]["content"], prompt_tokens, output_tokens, elapsed


def _print_phase(heading, content, prompt_tokens, output_tokens, elapsed):
    """Print one phase's heading, the stripped model output and its metrics."""
    print(heading)
    print(content.strip())
    print(f"β±οΈ Latency: {elapsed:.2f}s | πͺ Tokens: {prompt_tokens} in -> {output_tokens} out\n")


def run_unified_benchmark(model_name):
    """Run the three-phase calendar-skill benchmark against one Ollama model.

    One conversation is built up turn by turn, with GOOGLE_CALENDAR_SKILL as
    the system message:

    1. planning probe: the model analyses the task and lists constraints;
    2. preflight: the model emits the duplicate-check command;
    3. final action: after a mocked empty listing (``[]``) is reported back,
       the model emits the create command.

    Each phase's output, latency and token counts are printed, followed by
    aggregate totals.

    NOTE(review): none of the prompts tell the model the current date, even
    though the banner prints it and the task says "tomorrow" -- confirm
    whether that is intentional.

    Args:
        model_name: Name of the Ollama model passed to ``ollama.chat``.

    Returns:
        dict with keys ``"model"`` (the given name), ``"tps"`` (output tokens
        per second over the three chat calls, ``0.0`` if no time was
        measured) and ``"tokens"`` (total prompt + output tokens).

    Raises:
        Whatever ``ollama.chat`` raises; nothing is caught here.
    """
    banner = "============================================================="
    print(f"\n{banner}")
    print(f"π₯ UNIFIED SKILL BENCHMARK: {model_name}")
    print(f"π Current System Time Context: {CURRENT_TIME.strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    # --- PHASE 1: PROBE TASK REPRESENTATION & INTERNAL PLANNING ---
    p1_prompt = (
        "Task: 'add meeting by 6 pm tomorrow'.\n\n"
        "Before writing any bash scripts, analyze this task. "
        "List the strict constraints from your instructions that apply, "
        "and explain your planned execution sequence."
    )
    conversation = [
        {"role": "system", "content": GOOGLE_CALENDAR_SKILL},
        {"role": "user", "content": p1_prompt},
    ]
    p1_output, p1_in, p1_out, p1_time = _timed_chat(model_name, conversation)
    _print_phase("π [PHASE 1: Planning Probe Results]:", p1_output, p1_in, p1_out, p1_time)

    # --- PHASE 2: GENERATE PREFLIGHT CHECK ---
    # The raw (unstripped) reply is fed back so the model sees its own turn
    # exactly as it produced it.
    conversation.append({"role": "assistant", "content": p1_output})
    conversation.append({
        "role": "user",
        "content": "Excellent analysis. Now, generate the first exact bash command required to perform your preflight duplicate check.",
    })
    p2_output, p2_in, p2_out, p2_time = _timed_chat(model_name, conversation)
    _print_phase("π οΈ [PHASE 2: Preflight Command Generation]:", p2_output, p2_in, p2_out, p2_time)

    # --- INTERCEPT & INJECT MOCK DATA ---
    # Nothing is executed: the generated command is answered with a fixed
    # empty event list.
    mock_terminal_output = "[]"
    print("π [MOCK TERMINAL EXECUTION]:")
    print(f" Executing command generated above... returned: {mock_terminal_output} (0 events found)")

    # --- PHASE 3: GENERATE FINAL ACTION ---
    conversation.append({"role": "assistant", "content": p2_output})
    conversation.append({
        "role": "user",
        "content": f"The list command executed successfully and returned: {mock_terminal_output}. Generate the final bash command to create the meeting.",
    })
    p3_output, p3_in, p3_out, p3_time = _timed_chat(model_name, conversation)
    _print_phase("π [PHASE 3: Final Execution Generation]:", p3_output, p3_in, p3_out, p3_time)

    # --- COMPUTE COMPREHENSIVE PERFORMANCE AGGREGATIONS ---
    total_input = p1_in + p2_in + p3_in
    total_output = p1_out + p2_out + p3_out
    total_time = p1_time + p2_time + p3_time
    # Guard the division: a zero measured duration must not abort the run.
    avg_tps = total_output / total_time if total_time > 0 else 0.0

    print(f"π --- {model_name} METRIC AND EFFICIENCY ACCOUNTING ---")
    print(f" π₯ Total Input Context Evaluated: {total_input} tokens")
    print(f" π€ Total Output Logic Generated: {total_output} tokens")
    print(f" β³ Combined Execution Latency: {total_time:.2f} seconds")
    print(f" β‘ Measured Task Velocity: {avg_tps:.2f} tokens/s")
    print(f"{banner}\n")
    return {"model": model_name, "tps": avg_tps, "tokens": total_input + total_output}
if __name__ == "__main__":
    # Models to benchmark, run one after another in this order.
    test_models = [
        'llama3.1:latest',
        'qwen3.5:latest',
        'gemma4:e4b',
        'llama3.1-tools:latest',
        'qwen3.5-tools:latest',
        'gemma4-tools:latest',
    ]
    summary_results = []
    for model_name in test_models:
        # Top-level boundary: a failure for one model is reported and the
        # loop moves on to the next model.
        try:
            outcome = run_unified_benchmark(model_name)
        except Exception as err:
            print(f"β Failed executing unified run for {model_name}: {err}")
        else:
            summary_results.append(outcome)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment