Last active
March 26, 2026 20:42
-
-
Save devasheeshG/1d96218cd608e241643630ef10eadb97 to your computer and use it in GitHub Desktop.
Cost estimator for agentic LLM runs with prompt caching — models per-turn input, output, cache read/write across tool call loops.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def estimate_agent_cost(
    # pricing (per token)
    input_price: float,
    output_price: float,
    cache_read_price: float,
    cache_write_price: float,
    # initial turn
    # global system prompt + tool definitions
    system_prompt_tokens: int,
    # user query
    initial_query_tokens: int,
    # model output (likely contains first tool call)
    initial_output_tokens: int,
    # tool call loop
    num_tool_invocations: int,
    # model output per tool call (reasoning + next tool call)
    output_tokens_per_tool_invocation: int,
    # tool result fed back as input
    tool_result_tokens_per_tool_invocation: int,
    # final response
    # final model output after all tool calls
    final_response_output_tokens: int,
) -> float:
    """Estimate the total cost of one agentic run that uses prompt caching.

    The run is modelled as ``num_tool_invocations + 2`` model calls:

    * Turn 1: the system prompt and user query are billed at ``input_price``,
      the first output at ``output_price``, and the whole context (input plus
      output) at ``cache_write_price``.
    * Each tool turn: the context accumulated so far is billed at
      ``cache_read_price``, the tool result at ``input_price``, the model
      output at ``output_price``, and both new pieces at
      ``cache_write_price``. The context then grows by those new tokens.
    * Final turn: the accumulated context is billed at ``cache_read_price``
      and the final output at ``output_price`` plus ``cache_write_price``.
      No new input is added in this turn.

    NOTE(review): newly added input tokens are billed at both the input and
    the cache-write rate. If the provider bills cache writes *instead of*
    base input, this overestimates; confirm against the provider's billing
    rules before relying on absolute figures.

    Args:
        input_price: Price per uncached input token.
        output_price: Price per generated token.
        cache_read_price: Price per token read from the prompt cache.
        cache_write_price: Price per token written to the prompt cache.
        system_prompt_tokens: Tokens in the system prompt and tool definitions.
        initial_query_tokens: Tokens in the user's first message.
        initial_output_tokens: Tokens generated in the first model turn.
        num_tool_invocations: Number of tool-result turns in the loop.
        output_tokens_per_tool_invocation: Tokens generated per tool turn.
        tool_result_tokens_per_tool_invocation: Tool-result tokens fed back
            per tool turn.
        final_response_output_tokens: Tokens generated in the final turn.

    Returns:
        The total cost, in the currency unit of the prices, rounded to four
        decimal places.

    Raises:
        ValueError: If any price or count is negative.
    """
    arguments = {
        "input_price": input_price,
        "output_price": output_price,
        "cache_read_price": cache_read_price,
        "cache_write_price": cache_write_price,
        "system_prompt_tokens": system_prompt_tokens,
        "initial_query_tokens": initial_query_tokens,
        "initial_output_tokens": initial_output_tokens,
        "num_tool_invocations": num_tool_invocations,
        "output_tokens_per_tool_invocation": output_tokens_per_tool_invocation,
        "tool_result_tokens_per_tool_invocation": tool_result_tokens_per_tool_invocation,
        "final_response_output_tokens": final_response_output_tokens,
    }
    for name, value in arguments.items():
        if value < 0:
            raise ValueError(f"{name} must be non-negative, got {value!r}")

    turns = num_tool_invocations

    # Context that exists (and is cached) once turn 1 has finished.
    first_turn_input = system_prompt_tokens + initial_query_tokens
    context_after_turn1 = first_turn_input + initial_output_tokens

    # Every tool turn appends one tool result and one model output.
    growth_per_turn = (
        output_tokens_per_tool_invocation + tool_result_tokens_per_tool_invocation
    )

    # Token totals per price class, summed over all turns. Pricing the
    # totals once keeps the count arithmetic exact instead of accumulating
    # floating-point error turn by turn.
    uncached_input_tokens = (
        first_turn_input + turns * tool_result_tokens_per_tool_invocation
    )
    generated_tokens = (
        initial_output_tokens
        + turns * output_tokens_per_tool_invocation
        + final_response_output_tokens
    )
    cache_write_tokens = (
        context_after_turn1 + turns * growth_per_turn + final_response_output_tokens
    )
    # Tool turn i (0-based) reads context_after_turn1 + i * growth_per_turn and
    # the final turn reads context_after_turn1 + turns * growth_per_turn, so
    # the reads form an arithmetic series over i = 0..turns.
    cache_read_tokens = (
        (turns + 1) * context_after_turn1
        + growth_per_turn * (turns * (turns + 1) // 2)
    )

    total_cost = (
        uncached_input_tokens * input_price
        + generated_tokens * output_price
        + cache_read_tokens * cache_read_price
        + cache_write_tokens * cache_write_price
    )
    # float() keeps the return type stable even when every argument is an int.
    return round(float(total_cost), 4)
if __name__ == "__main__":
    # Worked example: one long agentic run with 100 tool invocations.
    # Prints the estimated total cost for the scenario below.

    # opus 4.6 bedrock pricing (per token)
    # NOTE(review): figures are kept exactly as found; verify them against
    # the provider's current price sheet before trusting the absolute result.
    input_price = 10.00 / 1_000_000
    output_price = 25.00 / 1_000_000
    cache_read_price = 0.55 / 1_000_000
    cache_write_price = 6.25 / 1_000_000

    # initial turn
    system_prompt_tokens = 22_000
    initial_query_tokens = 1_000
    initial_output_tokens = 10_000

    # tool call loop
    num_tool_invocations = 100
    output_tokens_per_tool_invocation = 5_000
    tool_result_tokens_per_tool_invocation = 2_000

    # final response
    final_response_output_tokens = 10_000

    result = estimate_agent_cost(
        input_price=input_price,
        output_price=output_price,
        cache_read_price=cache_read_price,
        cache_write_price=cache_write_price,
        system_prompt_tokens=system_prompt_tokens,
        initial_query_tokens=initial_query_tokens,
        initial_output_tokens=initial_output_tokens,
        num_tool_invocations=num_tool_invocations,
        output_tokens_per_tool_invocation=output_tokens_per_tool_invocation,
        tool_result_tokens_per_tool_invocation=tool_result_tokens_per_tool_invocation,
        final_response_output_tokens=final_response_output_tokens,
    )
    print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment