Skip to content

Instantly share code, notes, and snippets.

@pedramamini
Created May 6, 2025 17:28
Show Gist options
  • Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.
Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.
Count tokens
#!/usr/bin/env python3
import sys
import tiktoken
# Encoding name used when the command line does not supply one.
DEFAULT_ENCODING = "cl100k_base"
def count_tokens(encoding_name, text):
    """Count the tokens in ``text`` under the named tiktoken encoding.

    Args:
        encoding_name: Name handed to ``tiktoken.get_encoding``.
        text: The string to tokenize.

    Returns:
        The number of tokens produced by encoding ``text``.

    Raises:
        ValueError: Propagated from ``tiktoken.get_encoding`` when the
            encoding name is not recognised.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # By default ``encode`` raises ValueError when the input merely contains
    # the text of a special token (e.g. "<|endoftext|>"). Arbitrary files can
    # contain such text, so disable that check and let it be tokenized as
    # ordinary text instead of aborting the count.
    return len(enc.encode(text, disallowed_special=()))
def safe_read_file(path):
    """Return the whole content of ``path`` decoded as UTF-8.

    Bytes that do not form valid UTF-8 are dropped rather than raising a
    decoding error.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents
def safe_read_stdin():
    """Return everything available on stdin as text, tolerating bad bytes.

    The raw bytes are taken from ``sys.stdin.buffer`` and decoded as UTF-8
    with ``errors="ignore"``. If that path raises ``AttributeError`` (for
    example because the stdin object has no ``buffer``), the text stream is
    read directly instead.
    """
    try:
        raw_bytes = sys.stdin.buffer.read()
        decoded = raw_bytes.decode("utf-8", errors="ignore")
    except AttributeError:
        # No usable binary layer: read the text stream as-is.
        decoded = sys.stdin.read()
    return decoded
def _exit_with_error(message):
    """Print ``Error: <message>`` to stderr and terminate with exit status 1."""
    print(f"Error: {message}", file=sys.stderr)
    sys.exit(1)


def main():
    """Command-line entry point: print the token count of a file or of stdin.

    Argument forms, taken from ``sys.argv``:

    * no arguments, or a first argument of ``-`` / ``--``: read stdin and use
      ``DEFAULT_ENCODING``;
    * one argument: treated as a file path if that file exists; otherwise
      treated as an encoding name, with the text read from stdin;
    * two or more arguments: the first is the encoding name and the second
      the file path (any further arguments are ignored).

    Failures (missing or unreadable file, unknown encoding) are reported on
    stderr with exit status 1 rather than as a traceback.
    """
    encoding_name = DEFAULT_ENCODING
    args = sys.argv[1:]

    if not args or args[0] in ("-", "--"):
        text = safe_read_stdin()
    elif len(args) == 1:
        candidate = args[0]
        try:
            text = safe_read_file(candidate)
        except FileNotFoundError:
            # Not an existing file, so interpret the argument as an encoding
            # name. Check it *before* touching stdin: otherwise a mistyped
            # path would block waiting for input and only fail afterwards.
            encoding_name = candidate
            try:
                tiktoken.get_encoding(encoding_name)
            except ValueError:
                _exit_with_error(
                    f"'{candidate}' is neither an existing file nor a known encoding."
                )
            text = safe_read_stdin()
        except OSError as exc:
            # e.g. the path is a directory or is not readable.
            _exit_with_error(f"Cannot read '{candidate}': {exc.strerror or exc}")
    else:
        encoding_name, file_path = args[0], args[1]
        try:
            text = safe_read_file(file_path)
        except FileNotFoundError:
            _exit_with_error(f"File '{file_path}' not found.")
        except OSError as exc:
            _exit_with_error(f"Cannot read '{file_path}': {exc.strerror or exc}")

    try:
        token_count = count_tokens(encoding_name, text)
    except ValueError as exc:
        # Raised by tiktoken, e.g. for an unrecognised encoding name.
        _exit_with_error(str(exc))
    print(token_count)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment