Skip to content

Instantly share code, notes, and snippets.

@tapadipti
Created December 13, 2024 09:45
Show Gist options
  • Save tapadipti/a9409f4a496b456ac0caafe5cc01bda2 to your computer and use it in GitHub Desktop.
Save tapadipti/a9409f4a496b456ac0caafe5cc01bda2 to your computer and use it in GitHub Desktop.
A Direct Integration of Anthropic Computer Use and AgentDesk
import sys
import base64
import io
import json
import anthropic
from agentdesk.device_v1 import Desktop
from typing import Any, Dict, List
import time
from copy import deepcopy
def capture_desktop_screenshot(desktop: Desktop) -> str:
    """Grab a single screenshot from the desktop and return it as base64 PNG text.

    Args:
        desktop: Desktop handle; one screenshot is requested through
            ``take_screenshots(count=1)``.

    Returns:
        The PNG-encoded screenshot as a base64 string (UTF-8 decoded).

    Raises:
        RuntimeError: If the desktop returns no screenshots.
    """
    shots = desktop.take_screenshots(count=1)
    if not shots:
        raise RuntimeError("Failed to capture screenshot")

    # Serialize the image into memory as PNG; nothing is written to disk.
    with io.BytesIO() as png_buffer:
        shots[0].save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    return base64.b64encode(png_bytes).decode("utf-8")
def run_desktop_action(desktop: Desktop, action: Dict[str, Any]) -> str:
    """Execute one computer-use tool action on the desktop and return a screenshot.

    Args:
        desktop: Desktop handle the action is performed on.
        action: The ``input`` payload of a ``tool_use`` block. The keys read
            here are ``action`` (the action name), ``coordinate`` (an
            ``[x, y]`` pair) and ``text``. The dict itself is not modified.

    Returns:
        Base64-encoded PNG screenshot taken roughly one second after the
        action was issued.

    Raises:
        KeyError: If an action that needs ``x``/``y`` or ``text`` is missing it.
        RuntimeError: If the follow-up screenshot cannot be captured.
    """
    print(f"\nExecuting desktop action: {json.dumps(action, indent=2)}")

    # Work on a shallow copy so the caller's tool input stays untouched.
    params = action.copy()
    action_type = params.get("action")

    # The tool reports positions as a [x, y] pair; the desktop calls take
    # separate x / y keyword arguments.
    if "coordinate" in params:
        x, y = params.pop("coordinate")
        params["x"] = x
        params["y"] = y

    # Key presses arrive as a single string and chords are joined with "+"
    # (e.g. "ctrl+s"), while hot_key() receives a list. Split the chord into
    # its individual keys instead of sending it as one unknown key name.
    # NOTE(review): key names are forwarded unchanged; confirm the names the
    # model emits are ones the desktop backend accepts.
    if "text" in params and action_type == "key":
        chord = params.pop("text")
        # A bare "+" (or any chord that splits to nothing) is kept verbatim.
        params["keys"] = [key for key in chord.split("+") if key] or [chord]

    if action_type == "mouse_move":
        desktop.move_mouse(x=params["x"], y=params["y"])
    elif action_type == "left_click":
        desktop.click()
    elif action_type == "left_click_drag":
        desktop.drag_mouse(x=params["x"], y=params["y"])
    elif action_type == "double_click":
        desktop.double_click()
    elif action_type == "type":
        desktop.type_text(params["text"])
    elif action_type == "key":
        desktop.hot_key(params["keys"])
    elif action_type == "screenshot":
        pass  # A screenshot is captured below for every action anyway.
    elif action_type == "cursor_position":
        coordinates = desktop.mouse_coordinates()
        print(f"Current mouse coordinates: {coordinates}")
    else:
        # Previously unknown actions were skipped without any trace, which made
        # it look as if they had succeeded. Still best-effort, but visible.
        print(f"Unsupported desktop action ignored: {action_type!r}")

    # Give the UI a moment to react before capturing the resulting state.
    time.sleep(1)
    return capture_desktop_screenshot(desktop)
def print_messages(messages: List[Dict[str, Any]]) -> None:
    """Print the message history as indented JSON with base64 payloads shortened.

    Only image-style items nested inside a content block's own ``content``
    list (the shape used for ``tool_result`` blocks) are shortened. The input
    list is never modified; all edits happen on a deep copy.

    Args:
        messages: Conversation history; each message is a dict whose
            ``content`` is either a string or a list of content blocks.
    """
    head, tail, ellipsis = 50, 20, "..."
    # Below this length the "shortened" form would be no shorter than the
    # original (and would repeat characters), so such payloads are left as-is.
    min_len_to_truncate = head + tail + len(ellipsis)

    messages_copy = deepcopy(messages)
    for message in messages_copy:
        blocks = message.get("content")
        if not isinstance(blocks, list):
            continue  # plain-string content has nothing to shorten
        for block in blocks:
            if not isinstance(block, dict):
                continue
            nested_items = block.get("content")
            if not isinstance(nested_items, list):
                continue
            for item in nested_items:
                if not isinstance(item, dict):
                    continue
                source = item.get("source")
                if not isinstance(source, dict):
                    continue
                data = source.get("data")
                if isinstance(data, str) and len(data) > min_len_to_truncate:
                    source["data"] = data[:head] + ellipsis + data[-tail:]

    print("\nCurrent message history:")
    print(json.dumps(messages_copy, indent=2))
def main(task: str) -> None:
    """Let Claude drive a desktop through the computer-use tool until it stops.

    Starts a desktop with ``Desktop.docker()``, opens its viewer in the
    background, then loops: send the conversation to Claude, execute every
    ``tool_use`` block it returns via ``run_desktop_action``, and feed the
    resulting screenshots back as ``tool_result`` blocks.

    The loop ends when Claude reports ``end_turn``, or when a response asks
    for no tool use at all (nothing new could be sent, so retrying would only
    repeat the same request).

    Args:
        task: Natural-language description of what should be done on the desktop.
    """
    # Initialize Desktop
    desktop = Desktop.docker()
    desktop.view(background=True)

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    HELP_ANTHROPIC = """
You are utilizing an Linux virtual machine with internet access.
To open firefox, please just click on the web browser (globe) icon.
When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there.
"""

    # Initial message
    messages = [{
        "role": "user",
        "content": f"{HELP_ANTHROPIC} Help me with this task: {task}."
    }]

    while True:
        print("\n--- New iteration ---")
        print_messages(messages)
        print("\nSending request to Claude...")
        response = client.beta.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            tools=[
                {
                    "type": "computer_20241022",
                    "name": "computer",
                    "display_width_px": 1024,
                    "display_height_px": 768,
                    "display_number": 1,
                }
            ],
            messages=messages,
            betas=["computer-use-2024-10-22"],
        )
        print("\nClaude's response:")
        print(f"{response}")

        # The agent will return 'end_turn' if it believes it's finished
        if response.stop_reason == "end_turn":
            # Use the first text block instead of assuming content[0] exists
            # and is text.
            final_text = next(
                (block.text for block in response.content if block.type == "text"),
                "",
            )
            print(f"✅ I think the task is done, please review the result: {final_text}")
            return

        # Rebuild Claude's turn as ONE assistant message (text and tool calls
        # together) and answer it with ONE user message holding every tool
        # result, so the history mirrors what Claude actually said.
        assistant_blocks = []
        tool_results = []
        for content in response.content:
            content_dict = content.model_dump()
            print(json.dumps(content_dict, indent=2))
            if content.type == "text":
                print(f"\nClaude's message: {content.text}")
                if content.text:  # skip empty text blocks
                    assistant_blocks.append({"type": "text", "text": content.text})
            elif content.type == "tool_use":
                screenshot = run_desktop_action(desktop, content.input)
                assistant_blocks.append(content_dict)
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": content.id,
                    "content": [{
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot,
                        },
                    }],
                })

        if not tool_results:
            # Without a tool call the history would be unchanged and the next
            # iteration would resend the identical request forever.
            print(
                "\nStopping: Claude requested no tool use "
                f"(stop_reason={response.stop_reason})."
            )
            return

        messages.append({"role": "assistant", "content": assistant_blocks})
        messages.append({"role": "user", "content": tool_results})
if __name__ == "__main__":
    # Exactly one positional argument (the task description) is accepted;
    # tuple unpacking fails for any other argument count.
    try:
        (task,) = sys.argv[1:]
    except ValueError:
        print("Usage: python main.py 'your task description'")
        sys.exit(1)
    main(task)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment