tapadipti · December 13, 2024 09:45
diff --git a/acu_agentdesk_ desktop_agent.py b/acu_agentdesk_ desktop_agent.py
 import sys
 import base64
 import io
 import json
 import anthropic
 from agentdesk.device_v1 import Desktop
 from typing import Any, Dict, List
 import time
 from copy import deepcopy

 def capture_desktop_screenshot(desktop: Desktop) -> str:
     """Grab one screenshot from *desktop* and return it as base64 PNG text.

     Args:
         desktop: Desktop handle; its ``take_screenshots(count=1)`` is called.

     Returns:
         The first screenshot, serialized as PNG and base64-encoded (UTF-8 str).

     Raises:
         RuntimeError: If ``take_screenshots`` yields nothing.
     """
     shots = desktop.take_screenshots(count=1)
     if not shots:
         raise RuntimeError("Failed to capture screenshot")

     # Serialize the first image to PNG entirely in memory.
     with io.BytesIO() as png_stream:
         shots[0].save(png_stream, format='PNG')
         png_bytes = png_stream.getvalue()

     encoded = base64.b64encode(png_bytes)
     return encoded.decode('utf-8')

 def run_desktop_action(desktop: Desktop, action: Dict[str, Any]) -> str:
     """Perform one computer-tool action on the desktop and return a screenshot.

     Args:
         desktop: Desktop handle the action is executed against.
         action: Tool-use input. ``action["action"]`` selects the operation;
             ``"coordinate"`` (an ``x, y`` pair) and ``"text"`` are read when
             the selected operation needs them. The dict is not modified.

     Returns:
         Base64-encoded PNG screenshot taken roughly one second after the
         action, regardless of which action (if any) was carried out.

     Raises:
         KeyError: If the selected operation needs a coordinate or text that
             ``action`` does not provide.
     """
     print(f"\nExecuting desktop action: {json.dumps(action, indent=2)}")

     # Work on a shallow copy so the caller's dict stays untouched.
     params = dict(action)
     action_type = params.get("action")

     # Split the coordinate pair into separate x / y entries.
     if "coordinate" in params:
         params["x"], params["y"] = params.pop("coordinate")

     # Key presses arrive as one string (e.g. "ctrl+s"); hand hot_key one list
     # entry per key instead of a single "ctrl+s" element.
     # NOTE(review): assumes hot_key expects individual key names - confirm
     # against the agentdesk API.
     if action_type == "key" and "text" in params:
         combo = params.pop("text")
         params["keys"] = [key for key in combo.split("+") if key] or [combo]

     if action_type == "mouse_move":
         desktop.move_mouse(x=params["x"], y=params["y"])
     elif action_type == "left_click":
         desktop.click()
     elif action_type == "left_click_drag":
         desktop.drag_mouse(x=params["x"], y=params["y"])
     elif action_type == "double_click":
         desktop.double_click()
     elif action_type == "type":
         desktop.type_text(params["text"])
     elif action_type == "key":
         desktop.hot_key(params["keys"])
     elif action_type == "screenshot":
         pass  # A screenshot is taken below for every action anyway.
     elif action_type == "cursor_position":
         coordinates = desktop.mouse_coordinates()
         print(f"Current mouse coordinates: {coordinates}")
     else:
         # Previously such actions were skipped without any trace, which made
         # it look as if they had been executed. Stay best-effort, but say so.
         print(f"Warning: unsupported desktop action {action_type!r}; nothing was executed")

     # Give the desktop a moment to settle before capturing the result.
     time.sleep(1)
     return capture_desktop_screenshot(desktop)

 def print_messages(messages: List[Dict[str, Any]]) -> None:
     """Print the message history as indented JSON with image payloads shortened.

     Any ``source["data"]`` string found in a content block (at any nesting
     depth) is replaced by its first 50 characters, ``"..."`` and its last 20
     characters, but only when that actually makes it shorter. Shortening is
     applied to a deep copy, so ``messages`` itself is never modified.

     Args:
         messages: Conversation history; each entry's ``"content"`` may be a
             plain string or a list of content blocks.
     """
     head, tail, ellipsis = 50, 20, "..."

     def _shorten(node: Any) -> None:
         """Shorten long ``source.data`` strings in *node*, in place."""
         if isinstance(node, list):
             for child in node:
                 _shorten(child)
             return
         if not isinstance(node, dict):
             return  # plain strings and other scalars carry no image payload

         source = node.get("source")
         if isinstance(source, dict):
             data = source.get("data")
             # Short payloads are left alone: "truncating" them would only
             # duplicate characters and make the output longer.
             if isinstance(data, str) and len(data) > head + tail + len(ellipsis):
                 source["data"] = data[:head] + ellipsis + data[-tail:]

         # Blocks such as tool results nest further blocks under "content".
         _shorten(node.get("content"))

     messages_copy = deepcopy(messages)
     _shorten(messages_copy)

     print("\nCurrent message history:")
     print(json.dumps(messages_copy, indent=2))

 def main(task: str):
     """Run the computer-use loop for *task* until the model stops asking for tools.

     Starts a Docker-backed desktop, then repeatedly sends the conversation to
     Claude with the computer tool enabled. Every ``tool_use`` block in a
     response is executed on the desktop and answered with a screenshot
     ``tool_result``. The loop ends when the model reports ``end_turn`` or when
     a response requests no tool at all.

     Args:
         task: Natural-language description of what the agent should do.
     """
     # Initialize Desktop
     desktop = Desktop.docker()
     desktop.view(background=True)

     # Initialize Anthropic client
     client = anthropic.Anthropic()

     # Plain string: the text has no placeholders, so no f-prefix is needed.
     HELP_ANTHROPIC = """
    You are utilizing an Linux virtual machine with internet access.
    To open firefox, please just click on the web browser (globe) icon.
    When using Firefox, if a startup wizard appears, IGNORE IT.  Do not even click "skip this step".  Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there.
    """

     # Initial message
     messages = [{
         "role": "user",
         "content": f"{HELP_ANTHROPIC} Help me with this task: {task}."
     }]

     while True:
         print("\n--- New iteration ---")
         print_messages(messages)

         print("\nSending request to Claude...")
         response = client.beta.messages.create(
             model="claude-3-5-sonnet-20241022",
             max_tokens=1024,
             tools=[
                 {
                     "type": "computer_20241022",
                     "name": "computer",
                     "display_width_px": 1024,
                     "display_height_px": 768,
                     "display_number": 1,
                 }
             ],
             messages=messages,
             betas=["computer-use-2024-10-22"]
         )

         print("\nClaude's response:")
         print(f"{response}")

         # The agent will return 'end_turn' if it believes it's finished
         if response.stop_reason == "end_turn":
             # Collect every text block instead of assuming the first block
             # exists and is text.
             final_text = "\n".join(
                 block.text for block in response.content if block.type == "text"
             )
             print(f"✅ I think the task is done, please review the result: {final_text}")
             return

         handled_tool_use = False
         for content in response.content:
             content_dict = content.model_dump()
             print(json.dumps(content_dict, indent=2))

             if content.type == "text":
                 print(f"\nClaude's message: {content.text}")

             elif content.type == "tool_use":
                 handled_tool_use = True
                 screenshot = run_desktop_action(desktop, content.input)
                 # Record the tool call and answer it with the resulting screenshot.
                 messages.extend([
                     {
                         "role": "assistant",
                         "content": [content_dict]
                     },
                     {
                         "role": "user",
                         "content": [{
                             "type": "tool_result",
                             "tool_use_id": content.id,
                             "content": [{
                                 "type": "image",
                                 "source": {
                                     "type": "base64",
                                     "media_type": "image/png",
                                     "data": screenshot
                                 }
                             }]
                         }]
                     }
                 ])

         if not handled_tool_use:
             # Nothing was appended to the history, so the next request would be
             # identical to this one; stop instead of looping forever.
             print(f"Stopping: no tool use requested (stop_reason={response.stop_reason!r})")
             return

 if __name__ == "__main__":
     # Exactly one positional argument is expected: the task description.
     if len(sys.argv) != 2:
         # Show the name the script was actually invoked with (it is not
         # necessarily main.py) and report the usage error on stderr.
         print(f"Usage: python {sys.argv[0]} 'your task description'", file=sys.stderr)
         sys.exit(1)

     task = sys.argv[1]
     main(task)
	import sys
	import base64
	import io
	import json
	import anthropic
	from agentdesk.device_v1 import Desktop
	from typing import Any, Dict, List
	import time
	from copy import deepcopy

	def capture_desktop_screenshot(desktop: Desktop) -> str:
	    """Grab one screenshot from *desktop* and return it as base64 PNG text.

	    Args:
	        desktop: Desktop handle; its ``take_screenshots(count=1)`` is called.

	    Returns:
	        The first screenshot, serialized as PNG and base64-encoded (UTF-8 str).

	    Raises:
	        RuntimeError: If ``take_screenshots`` yields nothing.
	    """
	    shots = desktop.take_screenshots(count=1)
	    if not shots:
	        raise RuntimeError("Failed to capture screenshot")

	    # Serialize the first image to PNG entirely in memory.
	    with io.BytesIO() as png_stream:
	        shots[0].save(png_stream, format='PNG')
	        png_bytes = png_stream.getvalue()

	    encoded = base64.b64encode(png_bytes)
	    return encoded.decode('utf-8')

	def run_desktop_action(desktop: Desktop, action: Dict[str, Any]) -> str:
	    """Perform one computer-tool action on the desktop and return a screenshot.

	    Args:
	        desktop: Desktop handle the action is executed against.
	        action: Tool-use input. ``action["action"]`` selects the operation;
	            ``"coordinate"`` (an ``x, y`` pair) and ``"text"`` are read when
	            the selected operation needs them. The dict is not modified.

	    Returns:
	        Base64-encoded PNG screenshot taken roughly one second after the
	        action, regardless of which action (if any) was carried out.

	    Raises:
	        KeyError: If the selected operation needs a coordinate or text that
	            ``action`` does not provide.
	    """
	    print(f"\nExecuting desktop action: {json.dumps(action, indent=2)}")

	    # Work on a shallow copy so the caller's dict stays untouched.
	    params = dict(action)
	    action_type = params.get("action")

	    # Split the coordinate pair into separate x / y entries.
	    if "coordinate" in params:
	        params["x"], params["y"] = params.pop("coordinate")

	    # Key presses arrive as one string (e.g. "ctrl+s"); hand hot_key one list
	    # entry per key instead of a single "ctrl+s" element.
	    # NOTE(review): assumes hot_key expects individual key names - confirm
	    # against the agentdesk API.
	    if action_type == "key" and "text" in params:
	        combo = params.pop("text")
	        params["keys"] = [key for key in combo.split("+") if key] or [combo]

	    if action_type == "mouse_move":
	        desktop.move_mouse(x=params["x"], y=params["y"])
	    elif action_type == "left_click":
	        desktop.click()
	    elif action_type == "left_click_drag":
	        desktop.drag_mouse(x=params["x"], y=params["y"])
	    elif action_type == "double_click":
	        desktop.double_click()
	    elif action_type == "type":
	        desktop.type_text(params["text"])
	    elif action_type == "key":
	        desktop.hot_key(params["keys"])
	    elif action_type == "screenshot":
	        pass  # A screenshot is taken below for every action anyway.
	    elif action_type == "cursor_position":
	        coordinates = desktop.mouse_coordinates()
	        print(f"Current mouse coordinates: {coordinates}")
	    else:
	        # Previously such actions were skipped without any trace, which made
	        # it look as if they had been executed. Stay best-effort, but say so.
	        print(f"Warning: unsupported desktop action {action_type!r}; nothing was executed")

	    # Give the desktop a moment to settle before capturing the result.
	    time.sleep(1)
	    return capture_desktop_screenshot(desktop)

	def print_messages(messages: List[Dict[str, Any]]) -> None:
	    """Print the message history as indented JSON with image payloads shortened.

	    Any ``source["data"]`` string found in a content block (at any nesting
	    depth) is replaced by its first 50 characters, ``"..."`` and its last 20
	    characters, but only when that actually makes it shorter. Shortening is
	    applied to a deep copy, so ``messages`` itself is never modified.

	    Args:
	        messages: Conversation history; each entry's ``"content"`` may be a
	            plain string or a list of content blocks.
	    """
	    head, tail, ellipsis = 50, 20, "..."

	    def _shorten(node: Any) -> None:
	        """Shorten long ``source.data`` strings in *node*, in place."""
	        if isinstance(node, list):
	            for child in node:
	                _shorten(child)
	            return
	        if not isinstance(node, dict):
	            return  # plain strings and other scalars carry no image payload

	        source = node.get("source")
	        if isinstance(source, dict):
	            data = source.get("data")
	            # Short payloads are left alone: "truncating" them would only
	            # duplicate characters and make the output longer.
	            if isinstance(data, str) and len(data) > head + tail + len(ellipsis):
	                source["data"] = data[:head] + ellipsis + data[-tail:]

	        # Blocks such as tool results nest further blocks under "content".
	        _shorten(node.get("content"))

	    messages_copy = deepcopy(messages)
	    _shorten(messages_copy)

	    print("\nCurrent message history:")
	    print(json.dumps(messages_copy, indent=2))

	def main(task: str):
	    """Run the computer-use loop for *task* until the model stops asking for tools.

	    Starts a Docker-backed desktop, then repeatedly sends the conversation to
	    Claude with the computer tool enabled. Every ``tool_use`` block in a
	    response is executed on the desktop and answered with a screenshot
	    ``tool_result``. The loop ends when the model reports ``end_turn`` or when
	    a response requests no tool at all.

	    Args:
	        task: Natural-language description of what the agent should do.
	    """
	    # Initialize Desktop
	    desktop = Desktop.docker()
	    desktop.view(background=True)

	    # Initialize Anthropic client
	    client = anthropic.Anthropic()

	    # Plain string: the text has no placeholders, so no f-prefix is needed.
	    HELP_ANTHROPIC = """
	You are utilizing an Linux virtual machine with internet access.
	To open firefox, please just click on the web browser (globe) icon.
	When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there.
	"""

	    # Initial message
	    messages = [{
	        "role": "user",
	        "content": f"{HELP_ANTHROPIC} Help me with this task: {task}."
	    }]

	    while True:
	        print("\n--- New iteration ---")
	        print_messages(messages)

	        print("\nSending request to Claude...")
	        response = client.beta.messages.create(
	            model="claude-3-5-sonnet-20241022",
	            max_tokens=1024,
	            tools=[
	                {
	                    "type": "computer_20241022",
	                    "name": "computer",
	                    "display_width_px": 1024,
	                    "display_height_px": 768,
	                    "display_number": 1,
	                }
	            ],
	            messages=messages,
	            betas=["computer-use-2024-10-22"]
	        )

	        print("\nClaude's response:")
	        print(f"{response}")

	        # The agent will return 'end_turn' if it believes it's finished
	        if response.stop_reason == "end_turn":
	            # Collect every text block instead of assuming the first block
	            # exists and is text.
	            final_text = "\n".join(
	                block.text for block in response.content if block.type == "text"
	            )
	            print(f"✅ I think the task is done, please review the result: {final_text}")
	            return

	        handled_tool_use = False
	        for content in response.content:
	            content_dict = content.model_dump()
	            print(json.dumps(content_dict, indent=2))

	            if content.type == "text":
	                print(f"\nClaude's message: {content.text}")

	            elif content.type == "tool_use":
	                handled_tool_use = True
	                screenshot = run_desktop_action(desktop, content.input)
	                # Record the tool call and answer it with the resulting screenshot.
	                messages.extend([
	                    {
	                        "role": "assistant",
	                        "content": [content_dict]
	                    },
	                    {
	                        "role": "user",
	                        "content": [{
	                            "type": "tool_result",
	                            "tool_use_id": content.id,
	                            "content": [{
	                                "type": "image",
	                                "source": {
	                                    "type": "base64",
	                                    "media_type": "image/png",
	                                    "data": screenshot
	                                }
	                            }]
	                        }]
	                    }
	                ])

	        if not handled_tool_use:
	            # Nothing was appended to the history, so the next request would be
	            # identical to this one; stop instead of looping forever.
	            print(f"Stopping: no tool use requested (stop_reason={response.stop_reason!r})")
	            return

	if __name__ == "__main__":
	    # Exactly one positional argument is expected: the task description.
	    if len(sys.argv) != 2:
	        # Show the name the script was actually invoked with (it is not
	        # necessarily main.py) and report the usage error on stderr.
	        print(f"Usage: python {sys.argv[0]} 'your task description'", file=sys.stderr)
	        sys.exit(1)

	    task = sys.argv[1]
	    main(task)