Skip to content

Instantly share code, notes, and snippets.

@devasheeshG
Last active March 26, 2026 20:42
Show Gist options
  • Select an option

  • Save devasheeshG/1d96218cd608e241643630ef10eadb97 to your computer and use it in GitHub Desktop.

Select an option

Save devasheeshG/1d96218cd608e241643630ef10eadb97 to your computer and use it in GitHub Desktop.
Cost estimator for agentic LLM runs with prompt caching — models per-turn input, output, cache read/write across tool call loops.
def estimate_agent_cost(
    input_price: float,
    output_price: float,
    cache_read_price: float,
    cache_write_price: float,
    system_prompt_tokens: int,
    initial_query_tokens: int,
    initial_output_tokens: int,
    num_tool_invocations: int,
    output_tokens_per_tool_invocation: int,
    tool_result_tokens_per_tool_invocation: int,
    final_response_output_tokens: int,
) -> float:
    """Estimate the cost of one agentic run that uses prompt caching.

    The run is modelled as three phases whose costs are added together:

    1. Opening turn: the system prompt and user query are charged at
       ``input_price``, the model's first output at ``output_price``, and
       the whole context so far (prompt + first output) at
       ``cache_write_price``.
    2. One turn per tool invocation: the accumulated context is charged at
       ``cache_read_price``, the tool result at ``input_price``, the model
       output at ``output_price``, and the newly added tokens (tool result
       + model output) at ``cache_write_price``. The context then grows by
       those newly added tokens.
    3. Final turn: the accumulated context is charged at
       ``cache_read_price`` and the final output at both ``output_price``
       and ``cache_write_price``. No new input is charged on this turn.

    NOTE(review): fresh input is billed at ``input_price`` and at
    ``cache_write_price`` in the same turn. Whether a given provider bills
    cache-written tokens at both rates or at the cache-write rate only
    cannot be established from this code -- confirm against the provider's
    billing documentation.

    Args:
        input_price: Price per uncached input token.
        output_price: Price per generated output token.
        cache_read_price: Price per token read back from the cache.
        cache_write_price: Price per token written to the cache.
        system_prompt_tokens: Size of the global system prompt plus tool
            definitions.
        initial_query_tokens: Size of the user query.
        initial_output_tokens: Size of the model's first output (likely
            contains the first tool call).
        num_tool_invocations: Number of tool-call turns; ``range`` is
            used, so zero or a negative value means no tool turns.
        output_tokens_per_tool_invocation: Model output per tool turn
            (reasoning plus the next tool call).
        tool_result_tokens_per_tool_invocation: Size of the tool result
            fed back as input on each tool turn.
        final_response_output_tokens: Size of the final model output
            produced after all tool calls.

    Returns:
        The summed cost, rounded to 4 decimal places.
    """
    # --- opening turn ---------------------------------------------------
    prompt_tokens = system_prompt_tokens + initial_query_tokens
    # Prompt plus the first output forms the context that later turns read
    # back from the cache.
    context_tokens = prompt_tokens + initial_output_tokens

    # Start from a float zero so the result is a float even when every
    # price happens to be passed as an int.
    running_total = 0.0
    running_total += (
        prompt_tokens * input_price
        + initial_output_tokens * output_price
        + context_tokens * cache_write_price
    )

    # --- tool-call turns ------------------------------------------------
    for _ in range(num_tool_invocations):
        # Tokens appended to the conversation by this turn. They are
        # charged as a cache write now but only show up in the cache-read
        # term from the following turn onwards.
        appended_tokens = (
            output_tokens_per_tool_invocation
            + tool_result_tokens_per_tool_invocation
        )
        running_total += (
            context_tokens * cache_read_price
            + tool_result_tokens_per_tool_invocation * input_price
            + output_tokens_per_tool_invocation * output_price
            + appended_tokens * cache_write_price
        )
        context_tokens += appended_tokens

    # --- final response turn --------------------------------------------
    # The final output is also charged as a cache write, covering a
    # potential follow-up turn.
    running_total += (
        context_tokens * cache_read_price
        + final_response_output_tokens * output_price
        + final_response_output_tokens * cache_write_price
    )

    return round(running_total, 4)
if __name__ == "__main__":
    # Example scenario. Prices are written per million tokens and divided
    # down to the per-token rates the estimator expects.
    # Labelled by the original author as "opus 4.6 bedrock pricing";
    # the figures are not verified here.
    tokens_per_million = 1_000_000

    scenario = {
        # pricing (per token)
        "input_price": 10.00 / tokens_per_million,
        "output_price": 25.00 / tokens_per_million,
        "cache_read_price": 0.55 / tokens_per_million,
        "cache_write_price": 6.25 / tokens_per_million,
        # initial turn
        "system_prompt_tokens": 22_000,
        "initial_query_tokens": 1_000,
        "initial_output_tokens": 10_000,
        # tool call loop
        "num_tool_invocations": 100,
        "output_tokens_per_tool_invocation": 5_000,
        "tool_result_tokens_per_tool_invocation": 2_000,
        # final response
        "final_response_output_tokens": 10_000,
    }

    # Print the estimated total cost for the scenario.
    print(estimate_agent_cost(**scenario))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment