Skip to content

Instantly share code, notes, and snippets.

@moodymudskipper
Last active November 5, 2024 14:15
Show Gist options
  • Save moodymudskipper/f5fcbd23859c65f364b2d5979b284fd2 to your computer and use it in GitHub Desktop.
Save moodymudskipper/f5fcbd23859c65f364b2d5979b284fd2 to your computer and use it in GitHub Desktop.
computer use
# Capture the screen to an image file with the macOS `screencapture` tool
# (`-x`: no shutter sound, `-C`: include the cursor in the capture).
#
# @param file Path of the image to write; defaults to a fresh temporary
#   ".png" file.
# @return `file`, invisibly, so the call can be chained or assigned.
apple_screenshot <- function(file = tempfile(fileext = ".png")) {
  # Quote the path so that spaces or shell metacharacters cannot split or
  # alter the command. "~" is expanded first because quoting disables the
  # shell's own tilde expansion.
  target <- shQuote(path.expand(file), type = "sh")
  system(paste("screencapture -xC", target))
  invisible(file)
}
# Type a piece of text into the frontmost application by sending it as an
# AppleScript `keystroke` through "System Events".
#
# @param x Text to type.
# @return The value of `system()` for the `osascript` call (its exit status).
apple_keystroke <- function(x) {
  # Escape for an AppleScript string literal: backslashes first, then double
  # quotes, so the escapes we add are not escaped again.
  escaped <- gsub("\\", "\\\\", x, fixed = TRUE)
  escaped <- gsub("\"", "\\\"", escaped, fixed = TRUE)
  script <- sprintf(
    "tell application \"System Events\" to keystroke \"%s\"",
    escaped
  )
  # shQuote() protects the whole script from the shell, including any
  # apostrophes in the typed text (e.g. "what's up").
  system(paste("osascript -e", shQuote(script, type = "sh")))
}
# Press a key or key combination through AppleScript "System Events".
#
# `hotkey` lists zero or more modifiers followed by exactly one key, separated
# by spaces and/or "+", e.g. "control shft command opt f D" or "cmd+space".
# The whole string is lower-cased before it is interpreted.
#
# Modifiers are resolved with match.arg(), so unambiguous abbreviations such
# as "opt" are accepted; "cmd"/"super", "ctrl", "shft" and "alt" are aliases
# for command, control, shift and option.
#
# A one-character key is sent with `keystroke`; a longer key name is looked up
# in `key_map` and sent with `key code`.
#
# Note about numbered keys: apple_hotkey("shft cmd 4") doesn't work,
# but apple_hotkey("cmd ç") does.
#
# @param hotkey A single string describing the combination.
# @return The value of `system()` for the `osascript` call (its exit status).
apple_hotkey <- function(hotkey) {
  if (!is.character(hotkey) || length(hotkey) != 1L || is.na(hotkey)) {
    stop("`hotkey` must be a single string", call. = FALSE)
  }
  # Collapse runs of spaces / "+" into single spaces and trim the ends so that
  # no empty tokens are produced.
  hotkey <- tolower(trimws(gsub("[ +]+", " ", hotkey)))
  hotkey_split <- strsplit(hotkey, " ", fixed = TRUE)[[1]]
  if (length(hotkey_split) == 0L) {
    stop("`hotkey` must name at least one key", call. = FALSE)
  }
  modifiers <- head(hotkey_split, -1)
  key <- tail(hotkey_split, 1)

  # Names are lower case because the input has been lower-cased above.
  key_map <- c(
    "f1" = 122, "f2" = 120, "f3" = 99, "f4" = 118, "f5" = 96, "f6" = 97,
    "f7" = 98, "f8" = 100, "f9" = 101, "f10" = 109, "f11" = 103, "f12" = 111,
    "esc" = 53, "escape" = 53, "enter" = 36, "return" = 36,
    "delete" = 51, "backspace" = 51, "space" = 49, "tab" = 48,
    "left" = 123, "right" = 124, "down" = 125, "up" = 126,
    "home" = 115, "end" = 119, "page_down" = 121, "page_up" = 116
  )

  using <- ""
  if (length(modifiers) > 0L) {
    modifiers <- match.arg(
      modifiers,
      c(
        "control", "command", "super", "shift", "fn", "option",
        "cmd", "shft", "ctrl", "alt"
      ),
      several.ok = TRUE
    )
    aliases <- c(
      cmd = "command", super = "command", ctrl = "control",
      shft = "shift", alt = "option"
    )
    is_alias <- modifiers %in% names(aliases)
    modifiers[is_alias] <- unname(aliases[modifiers[is_alias]])
    modifiers <- unique(modifiers)
    using <- sprintf(" using {%s}", toString(paste(modifiers, "down")))
  }

  if (nchar(key) == 1L) {
    # Escape the two characters that are special inside an AppleScript string.
    if (key %in% c("\\", "\"")) {
      key <- paste0("\\", key)
    }
    action <- sprintf("keystroke \"%s\"", key)
  } else {
    if (!key %in% names(key_map)) {
      stop(
        sprintf(
          "Unknown key '%s'; supported named keys: %s",
          key, toString(names(key_map))
        ),
        call. = FALSE
      )
    }
    action <- sprintf("key code %s", key_map[[key]])
  }

  script <- paste0("tell application \"System Events\" to ", action, using)
  # Execute the AppleScript command; shQuote() shields it from the shell.
  system(paste("osascript -e", shQuote(script, type = "sh")))
}
# Move the mouse cursor to the given screen coordinates with `cliclick m:`.
#
# @param x,y Numeric coordinates. They are rounded to whole numbers because
#   the `%d` format rejects non-whole values.
# @return The value of `system()` for the `cliclick` call (its exit status).
apple_move_cursor <- function(x, y) {
  system(sprintf("cliclick m:%d,%d", round(x), round(y)))
}
# Single left click at the current mouse position or at given coordinates.
#
# @param x,y Coordinates to click at. A coordinate left as NULL is replaced by
#   the corresponding coordinate of the current cursor position.
# @return The value of `system()` for the `cliclick c:` call.
apple_left_click <- function(x = NULL, y = NULL) {
  # Only ask cliclick for the current position ("x,y") when it is needed.
  if (is.null(x) || is.null(y)) {
    position <- system("cliclick p:", intern = TRUE)
    current <- unlist(strsplit(position, ",", fixed = TRUE))
    if (is.null(x)) x <- current[[1]]
    if (is.null(y)) y <- current[[2]]
  }
  system(sprintf("cliclick c:%s,%s", x, y))
}
# Single right click at the current mouse position or at given coordinates.
#
# @param x,y Coordinates to click at. A coordinate left as NULL is replaced by
#   the corresponding coordinate of the current cursor position.
# @return The value of `system()` for the `cliclick rc:` call.
apple_right_click <- function(x = NULL, y = NULL) {
  # Only ask cliclick for the current position ("x,y") when it is needed.
  if (is.null(x) || is.null(y)) {
    position <- system("cliclick p:", intern = TRUE)
    current <- unlist(strsplit(position, ",", fixed = TRUE))
    if (is.null(x)) x <- current[[1]]
    if (is.null(y)) y <- current[[2]]
  }
  system(sprintf("cliclick rc:%s,%s", x, y))
}
# Double left click at the current mouse position or at given coordinates.
#
# @param x,y Coordinates to click at. A coordinate left as NULL is replaced by
#   the corresponding coordinate of the current cursor position.
# @return The value of `system()` for the `cliclick dc:` call.
apple_double_left_click <- function(x = NULL, y = NULL) {
  # Only ask cliclick for the current position ("x,y") when it is needed.
  if (is.null(x) || is.null(y)) {
    position <- system("cliclick p:", intern = TRUE)
    current <- unlist(strsplit(position, ",", fixed = TRUE))
    if (is.null(x)) x <- current[[1]]
    if (is.null(y)) y <- current[[2]]
  }
  system(sprintf("cliclick dc:%s,%s", x, y))
}
# Report where the mouse cursor currently is, as printed by `cliclick p:`.
#
# @return A list with integer elements `x` and `y`, taken from the first and
#   second comma-separated fields of the cliclick output.
apple_get_cursor_position <- function() {
  fields <- unlist(strsplit(system("cliclick p:", intern = TRUE), ","))
  xy <- lapply(1:2, function(i) as.integer(fields[i]))
  names(xy) <- c("x", "y")
  xy
}
# Launch a macOS application with `open -a`, optionally handing it a file.
#
# @param app Name of the application, e.g. "Firefox".
# @param file Optional path (or other argument) to open with the application.
# @return The value of `system()` for the `open` call (its exit status).
apple_open_app <- function(app, file = NULL) {
  # shQuote() rather than hand-written double quotes: names containing
  # quotes, `$` or backticks are passed to `open` literally instead of being
  # interpreted by the shell.
  parts <- c("open -a", shQuote(app, type = "sh"))
  if (!is.null(file)) {
    parts <- c(parts, shQuote(path.expand(file), type = "sh"))
  }
  system(paste(parts, collapse = " "))
}
library(httr)
library(jsonlite)
# Send one request to the Anthropic Messages API with the computer-use beta
# tools enabled and return the parsed JSON response.
#
# @param messages List of messages (each a list with `role` and `content`)
#   making up the conversation so far.
# @param model Model identifier sent in the request.
# @param max_tokens Maximum number of tokens the model may generate.
# @return The response body parsed with jsonlite::parse_json() (a nested
#   list). API-level errors are returned as such a list, not raised.
claude_computer_use_impl <- function(messages,
                                     model = "claude-3-5-sonnet-20241022",
                                     max_tokens = 1024) {
  api_url <- "https://api.anthropic.com/v1/messages" # Claude API endpoint
  api_key <- Sys.getenv("ANTHROPIC_API_KEY")
  if (!nzchar(api_key)) {
    stop(
      "The ANTHROPIC_API_KEY environment variable is not set",
      call. = FALSE
    )
  }
  headers <- httr::add_headers(
    "x-api-key" = api_key,
    "content-type" = "application/json",
    "anthropic-version" = "2023-06-01",
    "anthropic-beta" = "computer-use-2024-10-22"
  )
  tools <- list(
    list(
      type = "computer_20241022",
      name = "computer",
      # tweak
      display_width_px = 1440,
      display_height_px = 876,
      display_number = 1
    ),
    list(
      type = "text_editor_20241022",
      name = "str_replace_editor"
    ),
    list(
      type = "bash_20241022",
      name = "bash"
    )
  )
  # Today's date in the form "Tuesday, November 5, 2024". The day number is
  # formatted via as.integer() to drop the leading zero without relying on the
  # platform-specific "%-d" conversion.
  today <- Sys.Date()
  today_text <- sprintf(
    "%s %d, %s",
    format(today, "%A, %B"),
    as.integer(format(today, "%d")),
    format(today, "%Y")
  )
  # tweak
  # Named `system_prompt` (not `system`) to avoid shadowing base::system().
  system_prompt <- paste(
    sep = "\n",
    "<SYSTEM_CAPABILITY>",
    "* You are utilising a MacBook Air 4. with internet access.",
    "* Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example \"(DISPLAY=:1 xterm &)\". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did.",
    "* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B <lines before> -A <lines after> <query> <filename>` to confirm output.",
    "* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.",
    "* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.",
    sprintf("* The current date is %s.", today_text),
    "* In particular apps can be opened with `open -a \"APPNAME\"` or `open -a \"APPNAME\" \"FILE\"`",
    "* Use Firefox for browsing, or the specific browser the user requests you to use.",
    "* To save a picture, right click on it, then press the 'q' key to select 'Save Image As', then press enter to enter the save as dialog.",
    "* Ctrl Command F can be used to maximize or minimize the active window.",
    "* Whenever you use a hotkey, make sure that the hotkey is relevant to the active app, and describe what it's meant to achieve in your message.",
    "</SYSTEM_CAPABILITY>",
    "",
    "<IMPORTANT>",
    "* When using a web browser, if a startup wizard appears, IGNORE IT. Do not even click \"skip this step\". Instead, click on the address bar where it says \"Search or enter address\", and enter the appropriate search term or URL there.",
    "* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.",
    "</IMPORTANT>"
  )
  # Payload setup
  body <- list(
    model = model,
    max_tokens = max_tokens,
    tools = tools,
    messages = messages,
    system = system_prompt
  )
  # API request
  response <- httr::POST(
    api_url,
    headers,
    body = jsonlite::toJSON(body, auto_unbox = TRUE),
    encode = "json"
  )
  # Process and return response. The encoding is given explicitly so httr does
  # not have to guess it.
  result <- httr::content(response, "text", encoding = "UTF-8")
  jsonlite::parse_json(result)
}
# Add a `tool_result` block to the content of the last message.
#
# @param messages List of messages; the final one receives the new block.
# @param content The `tool_use` block being answered; its `id` becomes the
#   result's `tool_use_id`.
# @param image Optional path to a PNG file. When given, the file is
#   base64-encoded and attached to the result as an image.
# @return `messages` with the extra block appended to its last element.
append_to_last_message <- function(messages, content, image = NULL) {
  last <- length(messages)
  if (is.null(image)) {
    tool_result <- list(
      type = "tool_result",
      tool_use_id = content$id,
      is_error = FALSE
    )
  } else {
    encoded <- base64enc::base64encode(image)
    image_block <- list(
      type = "image",
      source = list(
        type = "base64",
        media_type = "image/png",
        data = encoded
      )
    )
    tool_result <- list(
      type = "tool_result",
      content = list(image_block),
      tool_use_id = content$id,
      is_error = FALSE
    )
  }
  messages[[last]]$content <- c(messages[[last]]$content, list(tool_result))
  messages
}
# Run the "computer use" loop: send `prompt` to Claude, carry out every tool
# call it asks for on this Mac, send the results back, and repeat until the
# model stops with a reason other than "tool_use".
#
# Every `tool_use` block is answered with a `tool_result`. Unsupported tools
# or actions, and tool calls that raise an R error, are reported back to the
# model as error results instead of being left unanswered or aborting the
# session.
#
# @param prompt The user's instruction, as a string.
# @return The full list of exchanged messages, invisibly. If the API reports
#   an error, an rlang error is raised that carries the messages so far in its
#   `conversation` field.
claude_computer_use <- function(prompt) {
  # tweak: factor applied to the model's coordinates to obtain screen
  # coordinates (found by trial and error).
  coord_scale <- 1.04

  # Append a tool_result carrying an optional text payload and error flag.
  append_text_result <- function(messages, content, text = NULL,
                                 is_error = FALSE) {
    result <- list(
      type = "tool_result",
      tool_use_id = content$id,
      is_error = is_error
    )
    if (!is.null(text)) {
      result$content <- list(list(type = "text", text = text))
    }
    last <- length(messages)
    messages[[last]]$content <- c(messages[[last]]$content, list(result))
    messages
  }

  # Tell the model that a requested tool or action is not available here.
  append_unsupported <- function(messages, content, what) {
    message(sprintf("* unsupported %s", what))
    append_text_result(
      messages, content,
      text = sprintf("Unsupported %s: not implemented by this client.", what),
      is_error = TRUE
    )
  }

  # Carry out one action of the "computer" tool and record its result.
  run_computer_action <- function(messages, content) {
    input <- content$input
    action <- input$action
    if (!is.character(action) || length(action) != 1L) {
      action <- "<missing>"
    }
    switch(
      action,
      screenshot = {
        message("* Taking a screenshot")
        # Use a temporary file instead of writing into the working directory.
        shot <- apple_screenshot()
        on.exit(unlink(shot), add = TRUE)
        append_to_last_message(messages, content, image = shot)
      },
      key = {
        message(sprintf("* triggering hotkey: %s", input$text))
        apple_hotkey(input$text)
        append_to_last_message(messages, content)
      },
      type = {
        message(sprintf("* typing: %s", input$text))
        apple_keystroke(input$text)
        append_to_last_message(messages, content)
      },
      mouse_move = {
        x <- round(coord_scale * input$coordinate[[1]])
        y <- round(coord_scale * input$coordinate[[2]])
        message(sprintf("* Moving cursor to: %s, %s", x, y))
        apple_move_cursor(x, y)
        message("sleep 1 sec to see new cursor position")
        Sys.sleep(1)
        append_to_last_message(messages, content)
      },
      left_click = {
        message("* left click")
        apple_left_click()
        append_to_last_message(messages, content)
      },
      right_click = {
        message("* right click")
        apple_right_click()
        append_to_last_message(messages, content)
      },
      double_click = {
        message("* double click")
        apple_double_left_click()
        append_to_last_message(messages, content)
      },
      cursor_position = {
        message("* reading cursor position")
        pos <- apple_get_cursor_position()
        # Convert back to the coordinate space the model works in.
        append_text_result(
          messages, content,
          text = sprintf(
            "X=%d,Y=%d",
            round(pos$x / coord_scale),
            round(pos$y / coord_scale)
          )
        )
      },
      append_unsupported(
        messages, content, sprintf("computer action '%s'", action)
      )
    )
  }

  # "bash" tool: a deliberate hack. Commands are not run in a shell; the
  # command is reduced to an application name (plus optional file) and handed
  # to apple_open_app().
  run_bash <- function(messages, content) {
    cmd0 <- content$input$command
    if (!is.character(cmd0) || length(cmd0) != 1L) {
      return(append_text_result(
        messages, content,
        text = "No command supplied; only launching applications is supported.",
        is_error = TRUE
      ))
    }
    # Strip the "(DISPLAY=:1 ... &)" wrapper the system prompt suggests, and a
    # leading "open -a" if the model already wrote the macOS form.
    cmd1 <- sub("(DISPLAY=:1 ", "", cmd0, fixed = TRUE)
    cmd2 <- sub(" &)", "", cmd1, fixed = TRUE)
    cmd2 <- sub("^\\s*open\\s+-a\\s+", "", cmd2)
    message(sprintf("* Running command: open -a %s", cmd2))
    # scan() splits on whitespace but keeps quoted names such as
    # "Google Chrome" together.
    args <- scan(text = cmd2, what = "", quiet = TRUE)
    if (length(args) == 0L) {
      return(append_text_result(
        messages, content,
        text = "Empty command; only launching applications is supported.",
        is_error = TRUE
      ))
    }
    if (length(args) == 2L) {
      apple_open_app(args[[1]], args[[2]])
    } else {
      apple_open_app(args[[1]])
    }
    append_to_last_message(messages, content)
  }

  # Dispatch one tool_use block to the matching tool implementation.
  run_tool <- function(messages, content) {
    name <- content$name
    if (identical(name, "computer")) {
      run_computer_action(messages, content)
    } else if (identical(name, "bash")) {
      run_bash(messages, content)
    } else {
      append_unsupported(messages, content, sprintf("tool '%s'", name))
    }
  }

  messages <- list(
    list(
      role = "user",
      content = prompt
    )
  )
  repeat {
    message(sprintf(
      "message (%s) --------------------------------------------------------",
      length(messages) + 1
    ))
    # Retry for as long as the API reports that it is overloaded.
    repeat {
      res <- claude_computer_use_impl(messages)
      overloaded <- identical(res$type, "error") &&
        identical(res$error$type, "overloaded_error")
      if (!overloaded) break
      rlang::inform(c(
        sprintf("%s: %s", res$error$type, res$error$message),
        i = "we'll wait 5 sec and try again"
      ))
      Sys.sleep(5)
    }
    if (identical(res$type, "error")) {
      # The conversation travels with the condition rather than being written
      # to a global variable.
      rlang::abort(
        sprintf("%s: %s", res$error$type, res$error$message),
        class = "claude_api_error",
        conversation = messages
      )
    }
    # fetch text response, usually it's in the first element, sometimes absent
    text <- unlist(lapply(res$content, function(x) x$text))
    if (length(text) == 0L) text <- "*no text in response*"
    writeLines(text)
    messages <- c(
      messages,
      list(list(role = "assistant", content = res$content))
    )
    stop_reason <- res$stop_reason
    if (length(stop_reason) != 1L) {
      rlang::abort(
        "The API response has no `stop_reason`",
        class = "claude_api_error",
        conversation = messages
      )
    }
    if (stop_reason != "tool_use") break

    # Collect the results of all requested tool calls in one user message.
    messages <- c(messages, list(list(role = "user", content = list())))
    for (content in res$content) {
      if (!identical(content$type, "tool_use")) next
      messages <- tryCatch(
        run_tool(messages, content),
        error = function(e) {
          message(sprintf("* tool call failed: %s", conditionMessage(e)))
          append_text_result(
            messages, content,
            text = conditionMessage(e),
            is_error = TRUE
          )
        }
      )
    }
  }
  invisible(messages)
}
https://docs.anthropic.com/en/api/messages
https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool
https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/loop.py
* This is a rough implementation of computer use with Anthropic Claude in R
* It sometimes works beautifully, sometimes not
* If you remove important files, or email your credit card info to all your friends, don't blame me: system automation is dangerous
* You'll need an api key linked to a credit card: https://console.anthropic.com/dashboard, env variable ANTHROPIC_API_KEY
* It was done for my own Mac laptop, for other systems you'll have to tweak the apple_*() functions
(you might use `ask::ask_in_place()` to adapt them to your system)
* Look up the '# tweak' comments to see what might need tweaking for your system,
namely the system message, the screen resolution, and the multiplier
factor used on coordinates (no idea why I needed one)
* I used a horrible hack to avoid running bash directly and call `open -a ...` instead, because for
some reason bash didn't find apps out of the box. This is brittle and can and should be fixed
* You might need `brew install cliclick`
* The main function is `claude_computer_use()`
* Try things like :
# these should always work if you've set up the tool well
claude_computer_use("what's on my screen")
claude_computer_use("move the cursor to the center of the trash bin icon")
# This one will sometimes work sometimes not
claude_computer_use("find a good middle price vaccuum cleaner for me on amazon uk")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment