devasheeshG · March 26, 2026 20:42
diff --git a/llm_cost_calc.py b/llm_cost_calc.py
 def estimate_agent_cost(
    # pricing (per token)
    input_price: float,
    output_price: float,
    cache_read_price: float,
    cache_write_price: float,

    # initial turn
    # global system prompt + tool definitions
    system_prompt_tokens: int,
    # user query
    initial_query_tokens: int,
    # model output (likely contains first tool call)
    initial_output_tokens: int,

    # tool call loop
    num_tool_invocations: int,
    # model output per tool call (reasoning + next tool call)
    output_tokens_per_tool_invocation: int,
    # tool result fed back as input
    tool_result_tokens_per_tool_invocation: int,

    # final response
    # final model output after all tool calls
    final_response_output_tokens: int,
 ) -> float:
    """Estimate the total cost of one agent run that uses prompt caching.

    The run is modelled as an initial turn, ``num_tool_invocations``
    tool-call turns of identical size, and a final response turn. Every
    turn after the first reads the whole accumulated context from the
    cache, and every turn writes the tokens it adds to the cache.

    Args:
        input_price: Price per uncached input token.
        output_price: Price per generated output token.
        cache_read_price: Price per token read from the cache.
        cache_write_price: Price per token written to the cache.
        system_prompt_tokens: Tokens in the system prompt and tool
            definitions.
        initial_query_tokens: Tokens in the user's query.
        initial_output_tokens: Tokens the model generates on the first
            turn.
        num_tool_invocations: Number of tool-call turns after the first
            turn.
        output_tokens_per_tool_invocation: Tokens the model generates on
            each tool-call turn.
        tool_result_tokens_per_tool_invocation: Tokens of tool result fed
            back as input on each tool-call turn.
        final_response_output_tokens: Tokens in the final response.

    Returns:
        The total cost, in the currency unit of the prices, rounded to
        4 decimal places.

    Raises:
        ValueError: If any argument is negative.
    """
    # A negative count or price would silently shrink the estimate (and a
    # negative invocation count would just skip the loop), so fail loudly.
    must_be_non_negative = {
        "input_price": input_price,
        "output_price": output_price,
        "cache_read_price": cache_read_price,
        "cache_write_price": cache_write_price,
        "system_prompt_tokens": system_prompt_tokens,
        "initial_query_tokens": initial_query_tokens,
        "initial_output_tokens": initial_output_tokens,
        "num_tool_invocations": num_tool_invocations,
        "output_tokens_per_tool_invocation": output_tokens_per_tool_invocation,
        "tool_result_tokens_per_tool_invocation": tool_result_tokens_per_tool_invocation,
        "final_response_output_tokens": final_response_output_tokens,
    }
    for arg_name, arg_value in must_be_non_negative.items():
        if arg_value < 0:
            raise ValueError(f"{arg_name} must be non-negative, got {arg_value!r}")

    total_cost = 0.0

    # turn 1: initial query
    # input tokens = system prompt + user query (not yet cached)
    input_tokens_turn1 = system_prompt_tokens + initial_query_tokens

    # everything up to and including the model's first output gets cached
    # for the next turn (output is sent back as conversation history)
    cached_after_turn1 = input_tokens_turn1 + initial_output_tokens

    # NOTE(review): new input tokens are billed at input_price and again at
    # cache_write_price (here and in the loop below) - confirm this matches
    # how the provider bills tokens that are written to the cache.
    total_cost += (
        # uncached input
        input_tokens_turn1 * input_price
        # model output
        + initial_output_tokens * output_price
        # write entire context to cache
        + cached_after_turn1 * cache_write_price
    )

    # turns 2..N+1: tool call loop
    # Everything except the cache read is identical on every tool-call
    # turn, so compute those parts once instead of once per iteration.
    new_tokens_per_turn = (
        output_tokens_per_tool_invocation + tool_result_tokens_per_tool_invocation
    )
    tool_result_cost = tool_result_tokens_per_tool_invocation * input_price
    tool_output_cost = output_tokens_per_tool_invocation * output_price
    # charged on the turn the tokens are produced, although they are only
    # read from cache starting the next turn
    new_tokens_cache_write_cost = new_tokens_per_turn * cache_write_price

    # grows each iteration as new tokens are added
    cached_tokens = cached_after_turn1

    for _ in range(num_tool_invocations):
        total_cost += (
            # read full prior context from cache
            cached_tokens * cache_read_price
            # tool result is new uncached input
            + tool_result_cost
            # model output for this turn
            + tool_output_cost
            # cache the new tokens (tool result + model output)
            + new_tokens_cache_write_cost
        )
        # advance cached context by the new tokens added this turn
        cached_tokens += new_tokens_per_turn

    # final turn: response generation
    total_cost += (
        # read full accumulated context from cache
        cached_tokens * cache_read_price
        # final model output
        + final_response_output_tokens * output_price
        # cache final output for potential follow-up turns
        + final_response_output_tokens * cache_write_price
    )

    return round(total_cost, 4)


 if __name__ == "__main__":
    # Worked example: one long agent run with many tool calls.
    tokens_per_million = 1_000_000

    # opus 4.6 bedrock pricing, quoted per million tokens and converted
    # to the per-token prices the estimator expects
    pricing = {
        "input_price": 10.00 / tokens_per_million,
        "output_price": 25.00 / tokens_per_million,
        "cache_read_price": 0.55 / tokens_per_million,
        "cache_write_price": 6.25 / tokens_per_million,
    }

    # token counts for the initial turn, each tool-call turn, and the
    # final response
    workload = {
        "system_prompt_tokens": 22_000,
        "initial_query_tokens": 1_000,
        "initial_output_tokens": 10_000,
        "num_tool_invocations": 100,
        "output_tokens_per_tool_invocation": 5_000,
        "tool_result_tokens_per_tool_invocation": 2_000,
        "final_response_output_tokens": 10_000,
    }

    print(estimate_agent_cost(**pricing, **workload))
	def estimate_agent_cost(
	    # pricing (per token)
	    input_price: float,
	    output_price: float,
	    cache_read_price: float,
	    cache_write_price: float,

	    # initial turn
	    # global system prompt + tool definitions
	    system_prompt_tokens: int,
	    # user query
	    initial_query_tokens: int,
	    # model output (likely contains first tool call)
	    initial_output_tokens: int,

	    # tool call loop
	    num_tool_invocations: int,
	    # model output per tool call (reasoning + next tool call)
	    output_tokens_per_tool_invocation: int,
	    # tool result fed back as input
	    tool_result_tokens_per_tool_invocation: int,

	    # final response
	    # final model output after all tool calls
	    final_response_output_tokens: int,
	) -> float:
	    """Estimate the total cost of one agent run that uses prompt caching.

	    The run is modelled as an initial turn, ``num_tool_invocations``
	    tool-call turns of identical size, and a final response turn. Every
	    turn after the first reads the whole accumulated context from the
	    cache, and every turn writes the tokens it adds to the cache.

	    Args:
	        input_price: Price per uncached input token.
	        output_price: Price per generated output token.
	        cache_read_price: Price per token read from the cache.
	        cache_write_price: Price per token written to the cache.
	        system_prompt_tokens: Tokens in the system prompt and tool
	            definitions.
	        initial_query_tokens: Tokens in the user's query.
	        initial_output_tokens: Tokens the model generates on the first
	            turn.
	        num_tool_invocations: Number of tool-call turns after the first
	            turn.
	        output_tokens_per_tool_invocation: Tokens the model generates on
	            each tool-call turn.
	        tool_result_tokens_per_tool_invocation: Tokens of tool result fed
	            back as input on each tool-call turn.
	        final_response_output_tokens: Tokens in the final response.

	    Returns:
	        The total cost, in the currency unit of the prices, rounded to
	        4 decimal places.

	    Raises:
	        ValueError: If any argument is negative.
	    """
	    # A negative count or price would silently shrink the estimate (and a
	    # negative invocation count would just skip the loop), so fail loudly.
	    must_be_non_negative = {
	        "input_price": input_price,
	        "output_price": output_price,
	        "cache_read_price": cache_read_price,
	        "cache_write_price": cache_write_price,
	        "system_prompt_tokens": system_prompt_tokens,
	        "initial_query_tokens": initial_query_tokens,
	        "initial_output_tokens": initial_output_tokens,
	        "num_tool_invocations": num_tool_invocations,
	        "output_tokens_per_tool_invocation": output_tokens_per_tool_invocation,
	        "tool_result_tokens_per_tool_invocation": tool_result_tokens_per_tool_invocation,
	        "final_response_output_tokens": final_response_output_tokens,
	    }
	    for arg_name, arg_value in must_be_non_negative.items():
	        if arg_value < 0:
	            raise ValueError(f"{arg_name} must be non-negative, got {arg_value!r}")

	    total_cost = 0.0

	    # turn 1: initial query
	    # input tokens = system prompt + user query (not yet cached)
	    input_tokens_turn1 = system_prompt_tokens + initial_query_tokens

	    # everything up to and including the model's first output gets cached
	    # for the next turn (output is sent back as conversation history)
	    cached_after_turn1 = input_tokens_turn1 + initial_output_tokens

	    # NOTE(review): new input tokens are billed at input_price and again at
	    # cache_write_price (here and in the loop below) - confirm this matches
	    # how the provider bills tokens that are written to the cache.
	    total_cost += (
	        # uncached input
	        input_tokens_turn1 * input_price
	        # model output
	        + initial_output_tokens * output_price
	        # write entire context to cache
	        + cached_after_turn1 * cache_write_price
	    )

	    # turns 2..N+1: tool call loop
	    # Everything except the cache read is identical on every tool-call
	    # turn, so compute those parts once instead of once per iteration.
	    new_tokens_per_turn = (
	        output_tokens_per_tool_invocation + tool_result_tokens_per_tool_invocation
	    )
	    tool_result_cost = tool_result_tokens_per_tool_invocation * input_price
	    tool_output_cost = output_tokens_per_tool_invocation * output_price
	    # charged on the turn the tokens are produced, although they are only
	    # read from cache starting the next turn
	    new_tokens_cache_write_cost = new_tokens_per_turn * cache_write_price

	    # grows each iteration as new tokens are added
	    cached_tokens = cached_after_turn1

	    for _ in range(num_tool_invocations):
	        total_cost += (
	            # read full prior context from cache
	            cached_tokens * cache_read_price
	            # tool result is new uncached input
	            + tool_result_cost
	            # model output for this turn
	            + tool_output_cost
	            # cache the new tokens (tool result + model output)
	            + new_tokens_cache_write_cost
	        )
	        # advance cached context by the new tokens added this turn
	        cached_tokens += new_tokens_per_turn

	    # final turn: response generation
	    total_cost += (
	        # read full accumulated context from cache
	        cached_tokens * cache_read_price
	        # final model output
	        + final_response_output_tokens * output_price
	        # cache final output for potential follow-up turns
	        + final_response_output_tokens * cache_write_price
	    )

	    return round(total_cost, 4)


	if __name__ == "__main__":
	    # Worked example: print the estimated cost of one long agent run.

	    # opus 4.6 bedrock pricing (per token); each figure is divided by
	    # one million to turn a per-million-token price into a per-token one
	    input_price = 10.00 / 1_000_000
	    output_price = 25.00 / 1_000_000
	    cache_read_price = 0.55 / 1_000_000
	    cache_write_price = 6.25 / 1_000_000

	    # initial turn
	    system_prompt_tokens = 22_000
	    initial_query_tokens = 1_000
	    initial_output_tokens = 10_000

	    # tool call loop
	    num_tool_invocations = 100
	    output_tokens_per_tool_invocation = 5_000
	    tool_result_tokens_per_tool_invocation = 2_000

	    # final response
	    final_response_output_tokens = 10_000

	    result = estimate_agent_cost(
	        input_price=input_price,
	        output_price=output_price,
	        cache_read_price=cache_read_price,
	        cache_write_price=cache_write_price,
	        system_prompt_tokens=system_prompt_tokens,
	        initial_query_tokens=initial_query_tokens,
	        initial_output_tokens=initial_output_tokens,
	        num_tool_invocations=num_tool_invocations,
	        output_tokens_per_tool_invocation=output_tokens_per_tool_invocation,
	        tool_result_tokens_per_tool_invocation=tool_result_tokens_per_tool_invocation,
	        final_response_output_tokens=final_response_output_tokens,
	    )
	    print(result)
No results found