Created
May 6, 2025 17:28
-
-
Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.
Count tokens
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Count the tokens in a file or in stdin using a tiktoken encoding."""
import sys

import tiktoken

# Encoding used when the command line does not name one.
DEFAULT_ENCODING = "cl100k_base"
def count_tokens(encoding_name, text):
    """Count the number of tokens in ``text`` using the named encoding.

    Args:
        encoding_name: Encoding name passed to ``tiktoken.get_encoding``.
        text: The string to tokenize.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # By default ``encode`` raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>").  This tool
    # counts tokens in arbitrary input, so such text is encoded as ordinary
    # text instead of aborting the count.
    return len(enc.encode(text, disallowed_special=()))
def safe_read_file(path):
    """Read a file as UTF-8 text, dropping bytes that fail to decode.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded contents of the file as a string.

    Raises:
        OSError: If the file cannot be opened or read (for example
            FileNotFoundError when it does not exist).
    """
    # newline="" disables universal-newline translation so "\r\n" is kept
    # as-is; the returned text then matches what decoding the same bytes
    # from stdin produces, and the token count does not depend on how the
    # input was supplied.
    with open(path, "r", encoding="utf-8", errors="ignore", newline="") as f:
        return f.read()
def safe_read_stdin():
    """Read all of stdin as UTF-8 text, dropping bytes that fail to decode.

    The raw binary stream (``sys.stdin.buffer``) is preferred so that the
    decoding can be done here with ``errors="ignore"``.  If stdin has no
    ``buffer`` attribute, the text stream is read directly instead.

    Returns:
        The text read from stdin, or an empty string when ``sys.stdin`` is
        ``None`` (no stdin attached to the process).
    """
    stream = sys.stdin
    if stream is None:
        # Nothing to read from; treat as empty input rather than failing.
        return ""

    # Look the attribute up explicitly instead of wrapping the whole read in
    # ``except AttributeError``, which would also hide unrelated errors.
    raw = getattr(stream, "buffer", None)
    if raw is None:
        # Fallback: stdin was replaced by a text-only stream.
        return stream.read()
    return raw.read().decode("utf-8", errors="ignore")
def main():
    """Count tokens in a file or in stdin and print the count to stdout.

    Command-line forms:
        (no arguments), "-" or "--"   read stdin, use DEFAULT_ENCODING
        <file>                        read <file>, use DEFAULT_ENCODING
        <encoding>                    read stdin with <encoding> (chosen
                                      when no file of that name exists)
        <encoding> <file>             read <file> with <encoding>

    Arguments after the second are ignored.  When the file cannot be read
    or the encoding cannot be used, a message is written to stderr and the
    process exits with status 1.
    """

    def fail(message):
        # Report a user-facing error and stop with a non-zero exit status.
        print(f"Error: {message}", file=sys.stderr)
        sys.exit(1)

    args = sys.argv[1:]
    encoding_name = DEFAULT_ENCODING

    if not args or args[0] in ("-", "--"):
        # No arguments (or "-" / "--"): stdin with the default encoding.
        text = safe_read_stdin()
    elif len(args) == 1:
        candidate = args[0]
        try:
            text = safe_read_file(candidate)
        except FileNotFoundError:
            # Not a file, so treat the argument as an encoding name.  Check
            # it before blocking on stdin so a mistyped file name fails
            # immediately instead of waiting for input and then crashing.
            try:
                tiktoken.get_encoding(candidate)
            except ValueError as exc:
                fail(
                    f"'{candidate}' is not an existing file and not a "
                    f"usable encoding: {exc}"
                )
            encoding_name = candidate
            text = safe_read_stdin()
        except OSError as exc:
            # e.g. a directory or a file without read permission.
            fail(f"Cannot read '{candidate}': {exc}")
    else:
        # Two arguments: the first is the encoding, the second the file path.
        encoding_name, file_path = args[0], args[1]
        try:
            text = safe_read_file(file_path)
        except FileNotFoundError:
            fail(f"File '{file_path}' not found.")
        except OSError as exc:
            fail(f"Cannot read '{file_path}': {exc}")

    try:
        token_count = count_tokens(encoding_name, text)
    except ValueError as exc:
        # Report a ValueError from tokenization (such as an unusable
        # encoding name) as a plain error message, not a traceback.
        fail(exc)
    print(token_count)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment