Created
December 13, 2024 09:45
-
-
Save tapadipti/a9409f4a496b456ac0caafe5cc01bda2 to your computer and use it in GitHub Desktop.
A Direct Integration of Anthropic Computer Use and AgentDesk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import base64 | |
import io | |
import json | |
import anthropic | |
from agentdesk.device_v1 import Desktop | |
from typing import Any, Dict, List | |
import time | |
from copy import deepcopy | |
def capture_desktop_screenshot(desktop: Desktop) -> str:
    """Grab a single screenshot of the desktop and return it as base64 text.

    Args:
        desktop: Desktop to capture; one screenshot is requested through
            ``take_screenshots(count=1)``.

    Returns:
        The first returned screenshot, saved as PNG and base64-encoded into
        a UTF-8 string.

    Raises:
        RuntimeError: If the desktop returns no screenshots.
    """
    captured = desktop.take_screenshots(count=1)
    if captured:
        # Serialize in memory; the image never touches the filesystem.
        png_stream = io.BytesIO()
        captured[0].save(png_stream, format='PNG')
        encoded = base64.b64encode(png_stream.getvalue())
        return encoded.decode('utf-8')
    raise RuntimeError("Failed to capture screenshot")
def run_desktop_action(desktop: Desktop, action: Dict[str, Any]) -> str:
    """Execute one computer-use action on the desktop and return a screenshot.

    Args:
        desktop: Desktop the action is performed on.
        action: The ``input`` payload of a ``tool_use`` block. Its
            ``"action"`` entry selects the operation. ``"coordinate"`` (an
            ``[x, y]`` pair) is read for ``mouse_move`` and
            ``left_click_drag``; ``"text"`` is read for ``type`` and ``key``.

    Returns:
        Base64-encoded PNG screenshot taken after the action was attempted.

    Raises:
        KeyError: If the selected action needs ``"coordinate"`` or ``"text"``
            and the payload does not contain it.
        RuntimeError: If no screenshot could be captured afterwards.
    """
    print(f"\nExecuting desktop action: {json.dumps(action, indent=2)}")

    action_type = action.get("action")

    if action_type == "mouse_move":
        x, y = action["coordinate"]
        desktop.move_mouse(x=x, y=y)
    elif action_type == "left_click":
        # No coordinates are passed: the click lands wherever the pointer is.
        desktop.click()
    elif action_type == "left_click_drag":
        x, y = action["coordinate"]
        desktop.drag_mouse(x=x, y=y)
    elif action_type == "double_click":
        desktop.double_click()
    elif action_type == "type":
        desktop.type_text(action["text"])
    elif action_type == "key":
        combo = action["text"]
        # Chords arrive as one "+"-joined string (e.g. "ctrl+s"); hot_key is
        # given a list, so pass one entry per key instead of the whole string.
        # Fall back to the raw text if splitting leaves nothing usable.
        keys = [part for part in combo.split("+") if part] or [combo]
        desktop.hot_key(keys)
    elif action_type == "screenshot":
        pass  # A screenshot is always taken below, whatever the action was.
    elif action_type == "cursor_position":
        coordinates = desktop.mouse_coordinates()
        print(f"Current mouse coordinates: {coordinates}")
    else:
        # Make dropped requests visible instead of silently pretending the
        # action ran; the screenshot below still reports the current state.
        print(f"Unsupported desktop action {action_type!r}; nothing was executed")

    # Presumably gives the UI a moment to settle before the capture.
    time.sleep(1)
    return capture_desktop_screenshot(desktop)
def print_messages(messages: List[Dict[str, Any]]) -> None:
    """Print the message history as indented JSON, shortening base64 payloads.

    Only image-style items nested inside a content block's own ``"content"``
    list (the shape used for tool results) are shortened: the ``"data"``
    string of their ``"source"`` dict is reduced to its first 50 and last 20
    characters joined by ``"..."``. Payloads too short to benefit are printed
    unchanged.

    Args:
        messages: Conversation history. It is deep-copied first, so the
            caller's messages are never modified.
    """
    head, tail, marker = 50, 20, "..."
    printable = deepcopy(messages)  # never mutate the caller's history

    for message in printable:
        blocks = message.get("content")
        if not isinstance(blocks, list):
            continue  # plain-string content has nothing to shorten
        for block in blocks:
            if not isinstance(block, dict):
                continue
            nested = block.get("content")
            if not isinstance(nested, list):
                continue
            for item in nested:
                if not isinstance(item, dict):
                    continue
                source = item.get("source")
                if not isinstance(source, dict):
                    continue
                data = source.get("data")
                # Shorten only when the result is really shorter; slicing a
                # short string would duplicate its characters instead.
                if isinstance(data, str) and len(data) > head + tail + len(marker):
                    source["data"] = data[:head] + marker + data[-tail:]

    print("\nCurrent message history:")
    print(json.dumps(printable, indent=2))
def main(task: str):
    """Let Claude drive a Docker-hosted desktop until it reports the task done.

    Each iteration sends the conversation so far to the model with the
    computer-use tool enabled, executes every ``tool_use`` block it returns
    on the desktop, and appends the action plus the resulting screenshot to
    the history. The loop ends when the model stops with ``end_turn``, or
    when a response requests no tool use at all.

    Args:
        task: Natural-language description of what should be accomplished.
    """
    # Initialize Desktop
    desktop = Desktop.docker()
    desktop.view(background=True)

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    # Environment hints sent ahead of the task; the wording is part of the
    # prompt, so it is kept exactly as it was.
    help_anthropic = (
        "\n"
        "You are utilizing an Linux virtual machine with internet access.\n"
        "To open firefox, please just click on the web browser (globe) icon.\n"
        'When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". '
        'Instead, click on the address bar where it says "Search or enter address", '
        "and enter the appropriate search term or URL there.\n"
    )

    # Initial message
    messages = [{
        "role": "user",
        "content": f"{help_anthropic} Help me with this task: {task}."
    }]

    while True:
        print("\n--- New iteration ---")
        print_messages(messages)

        print("\nSending request to Claude...")
        response = client.beta.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            tools=[
                {
                    "type": "computer_20241022",
                    "name": "computer",
                    "display_width_px": 1024,
                    "display_height_px": 768,
                    "display_number": 1,
                }
            ],
            messages=messages,
            betas=["computer-use-2024-10-22"]
        )

        print("\nClaude's response:")
        print(f"{response}")

        # The agent will return 'end_turn' if it believes it's finished
        if response.stop_reason == "end_turn":
            # Collect every text block rather than assuming the first block
            # exists and is text.
            final_text = "\n".join(
                block.text for block in response.content if block.type == "text"
            )
            print(f"✅ I think the task is done, please review the result: {final_text}")
            return

        tool_calls = 0
        for content in response.content:
            content_dict = content.model_dump()
            print(json.dumps(content_dict, indent=2))

            if content.type == "text":
                print(f"\nClaude's message: {content.text}")
            elif content.type == "tool_use":
                tool_calls += 1
                screenshot = run_desktop_action(desktop, content.input)
                # Record the requested action and answer it with the
                # screenshot taken after executing it.
                messages.extend([
                    {
                        "role": "assistant",
                        "content": [content_dict]
                    },
                    {
                        "role": "user",
                        "content": [{
                            "type": "tool_result",
                            "tool_use_id": content.id,
                            "content": [{
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": screenshot
                                }
                            }]
                        }]
                    }
                ])

        if tool_calls == 0:
            # Nothing was appended to the history, so the next request would
            # be identical to this one; stop instead of looping forever.
            print(
                "\nStopping: the response requested no tool use "
                f"(stop_reason={response.stop_reason!r})."
            )
            return
if __name__ == "__main__":
    # Exactly one positional argument is accepted: the task description.
    if len(sys.argv) == 2:
        task = sys.argv[1]
        main(task)
    else:
        print("Usage: python main.py 'your task description'")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment