Skip to content

Instantly share code, notes, and snippets.

@lis186
Last active March 27, 2026 03:38
Show Gist options
  • Select an option

  • Save lis186/38d55b9a650d76711438103c1935165a to your computer and use it in GitHub Desktop.

Select an option

Save lis186/38d55b9a650d76711438103c1935165a to your computer and use it in GitHub Desktop.
用文言,可減 token 乎?— 實測文言文 vs 白話文 vs English 的 Token 數(Claude / GPT / Qwen / DeepSeek)
"""
用文言,可減 token 乎?— 實測文言文 vs 白話文 vs English 的 Token 數
起因:有人在 LinkedIn 用文言文寫了一篇「前端工程師修養論」,
底下有人問 Haiku 4.5「用文言,可減 token 乎?」,Haiku 回「可也」。
真的嗎?我們來實測。
測試方法:
- 取文言原文,翻譯成等義的白話文和英文
- 用 5 種主流 tokenizer 分別計算 token 數
- 逐句對比,找出差異來源
需要安裝:
pip install tiktoken tokenizers transformers
結論(劇透):
- Claude tokenizer: 文言 406 vs 白話 406 → 完全一樣
- 英文在所有 tokenizer 上都大幅領先(省 28%~62%)
- 文言文字數少 19%,但 token 數沒省到
- 「爪哇腳本」4 tokens,「JavaScript」1 token
"""
import tiktoken
from tokenizers import Tokenizer
# ================================================================
# Test texts: the same content written in three registers
# ================================================================
# Classical Chinese original — taken from a LinkedIn post by 江中喬
classical = """君子編程,必也求乎。求者,乃得解於人工智能也。
汝乃一前端工程師也,於此,汝必撰一門面。
夫頁首者,門面之梁也。重簡便,不宜混亂。夫佈局者,貴乎流動而不滯;樣式者,取其簡約而不雜。手執腳本之利刃,運響應於掌心。雖瀏覽器千差萬別,心法一貫,則相容無礙。
若夫代碼之美,必也邏輯之嚴密、註釋之清明。
不求浮誇之風,但求用戶之易貶也,老嫗能解為佳。
故曰:工欲善其事,必先構其代碼、鞭其錯蟲、理其頭緒、空乏其版,所以動心忍性,增益吾所不能。
余勉也,為得一佳文得傳千古,汝必莫使多紕漏,徒留殘蟲,貽笑大方。
閱畢,即速寫超文本標記語言、階層樣式表及爪哇腳本,以達吾願、傳汝碼。"""
# Modern vernacular Chinese translation (equivalent meaning)
modern = """工程師寫程式,一定要追求品質。追求品質,就能從 AI 工具中獲得好的解答。
你是一個前端工程師,所以你必須做好頁面。
頁首是網站的門面,要簡潔,不要混亂。佈局要流暢、不能卡頓;樣式要簡約、不要繁雜。要熟練使用 JavaScript,靈活運用響應式設計。雖然瀏覽器各不相同,但核心方法是一致的,就能做到跨瀏覽器相容。
好的代碼,一定要邏輯嚴密、註釋清楚。
不追求花俏的風格,只要讓使用者容易理解就好,連不懂技術的人都能看懂最好。
所以說:要做好工作,必須先寫好代碼、修好 bug、整理好思路、清理好版本,這樣才能磨練心性,提升自己的能力。
我勉勵你,要寫出能流傳的好代碼,不要留下太多漏洞,否則留下一堆 bug,會被人笑話。
看完之後,就趕快去寫 HTML、CSS 和 JavaScript,來實現目標、交付你的程式碼。"""
# English translation (equivalent meaning)
english = """Engineers should pursue quality in their code. By pursuing quality, you can get good answers from AI tools.
You are a frontend engineer, so you must build a good interface.
The header is the face of a website — keep it simple, not cluttered. Layout should flow smoothly without stalling; styles should be minimal, not messy. Wield JavaScript skillfully, and master responsive design. Though browsers vary widely, the core approach is consistent, and you can achieve cross-browser compatibility.
Good code must have rigorous logic and clear comments.
Don't pursue flashy styles — just make it easy for users to understand. If even a non-technical person can follow it, that's ideal.
As the saying goes: to do good work, you must first write solid code, fix bugs, organize your thoughts, and clean up your versions. This is how you build resilience and improve your abilities.
I encourage you: write code worthy of lasting, and don't leave too many holes behind — otherwise you'll be left with bugs and become a laughingstock.
After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."""
# Display label -> text. The labels double as lookup keys in main(), so they
# must stay in sync with the texts["…"] accesses there.
texts = {"文言文": classical, "白話文": modern, "English": english}
# ================================================================
# 載入 Tokenizers
# ================================================================
def load_tokenizers():
    """Load every available tokenizer and return token-counting callables.

    Each tokenizer is loaded on a best-effort basis: if loading fails (package
    not installed, model files not downloadable, unknown encoding name, ...)
    a warning is printed and that tokenizer is simply left out, so the
    remaining ones can still be compared.

    Returns:
        dict: display label -> function taking a ``str`` and returning the
        number of tokens that tokenizer produces for it. Insertion order is
        Claude, GPT (cl100k), GPT (o200k), Qwen, DeepSeek.
    """
    tok_map = {}

    # Claude (Xenova/claude-tokenizer, a publicly hosted Claude BPE tokenizer).
    # NOTE(review): the "(all models)" label assumes this tokenizer matches
    # every Claude model — confirm against the tokenizer's model card.
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
        tok_map["Claude (all models)"] = lambda t: len(claude_tok.encode(t).ids)
    except Exception as e:
        print(f"⚠️ Claude tokenizer: {e}")

    # OpenAI encodings via tiktoken:
    #   cl100k_base -> GPT-4 / GPT-3.5 Turbo, o200k_base -> GPT-4o / o1 / o3.
    tiktoken_encodings = (
        ("GPT-4 / 3.5-Turbo (cl100k)", "cl100k_base"),
        ("GPT-4o / o1 / o3 (o200k)", "o200k_base"),
    )
    for label, encoding_name in tiktoken_encodings:
        try:
            enc = tiktoken.get_encoding(encoding_name)
        except Exception as e:
            print(f"⚠️ {label}: {e}")
            continue
        # Bind ``enc`` as a default so each lambda keeps its own encoding
        # instead of the loop variable's final value.
        tok_map[label] = lambda t, enc=enc: len(enc.encode(t))

    # Hugging Face tokenizers (Qwen 2.5, DeepSeek V2). transformers is an
    # optional dependency, so it is imported lazily and only once.
    try:
        from transformers import AutoTokenizer
    except Exception as e:
        print(f"⚠️ transformers: {e}")
        return tok_map

    hf_models = (
        ("Qwen 2.5", "Qwen/Qwen2.5-0.5B"),
        ("DeepSeek V2", "deepseek-ai/DeepSeek-V2-Lite"),
    )
    for label, repo_id in hf_models:
        try:
            hf_tok = AutoTokenizer.from_pretrained(repo_id)
        except Exception as e:
            print(f"⚠️ {label}: {e}")
            continue
        # add_special_tokens=False: count only the tokens of the text itself.
        # By default encode() may prepend/append special tokens (e.g. BOS),
        # which would inflate the count relative to tiktoken, which adds none.
        tok_map[label] = lambda t, hf_tok=hf_tok: len(
            hf_tok.encode(t, add_special_tokens=False)
        )

    return tok_map
# ================================================================
# 主程式
# ================================================================
def _char_count(text):
    """Return the number of characters in *text*, ignoring newlines and ASCII spaces."""
    return len(text.replace("\n", "").replace(" ", ""))


def _print_summary_table(tok_map):
    """Print text lengths and the per-tokenizer token table.

    Args:
        tok_map: display label -> token-counting callable.

    Returns:
        dict: tokenizer label -> (classical_tokens, modern_tokens, english_tokens).
    """
    print("=" * 85)
    print("📊 文言文 vs 白話文 vs English — 跨模型 Token 數完整對比")
    print("=" * 85)
    print("\n📏 文本長度:")
    for label, text in texts.items():
        print(f" {label}: {_char_count(text)} 字元 / {len(text.encode('utf-8'))} bytes")
    header = f"\n{'Tokenizer':<32} {'文言文':>7} {'白話文':>7} {'English':>7} │ {'文vs白':>7} {'文vs英':>7} {'白vs英':>7}"
    print(header)
    # The rule length is derived from the ASCII-only part of the header, i.e.
    # non-ASCII (CJK) characters are dropped before measuring.
    print("─" * len(header.encode("ascii", "ignore").decode()))

    all_results = {}
    for name, count_tokens in tok_map.items():
        c = count_tokens(texts["文言文"])
        m = count_tokens(texts["白話文"])
        e = count_tokens(texts["English"])
        all_results[name] = (c, m, e)
        # Positive = the first text uses fewer tokens than the second.
        c_vs_m = f"{(1-c/m)*100:+.1f}%"
        c_vs_e = f"{(1-c/e)*100:+.1f}%"
        m_vs_e = f"{(1-m/e)*100:+.1f}%"
        print(f"{name:<32} {c:>7} {m:>7} {e:>7} │ {c_vs_m:>7} {c_vs_e:>7} {m_vs_e:>7}")

    print("\n +值 = 省了 token(前者比後者少)")
    print(" -值 = 多用 token(前者比後者多)")
    print("\n📝 GPT 模型 → tokenizer 對應:")
    print(" cl100k_base → GPT-4, GPT-4 Turbo, GPT-3.5 Turbo")
    print(" o200k_base → GPT-4o, GPT-4o-mini, o1, o1-mini, o3, o3-mini")
    return all_results


def _print_line_by_line(claude_tok):
    """Print a per-line token comparison of the three texts using *claude_tok*."""
    from itertools import zip_longest

    print("\n" + "=" * 85)
    print("📝 逐句對比 (Claude tokenizer)")
    print("=" * 85)

    def non_empty_lines(text):
        return [ln.strip() for ln in text.strip().split("\n") if ln.strip()]

    def n_tokens(line):
        # Padding lines (empty strings) count as zero tokens.
        return len(claude_tok.encode(line).ids) if line else 0

    print(f"\n{'#':<3} {'文言':>6} {'白話':>6} {'英文':>6} │ {'最少':>4} 摘要")
    print("─" * 75)
    total = {"文言": 0, "白話": 0, "英文": 0}
    # zip_longest pads the shorter texts with "" so every row has three cells.
    rows = zip_longest(
        non_empty_lines(classical),
        non_empty_lines(modern),
        non_empty_lines(english),
        fillvalue="",
    )
    for idx, (c, m, e) in enumerate(rows, start=1):
        vals = {"文言": n_tokens(c), "白話": n_tokens(m), "英文": n_tokens(e)}
        for key, value in vals.items():
            total[key] += value
        # Only declare a winner when all three texts have a line in this row.
        winner = min(vals, key=vals.get) if all(v > 0 for v in vals.values()) else "—"
        snippet = c[:28]
        print(
            f"{idx:<3} {vals['文言']:>6} {vals['白話']:>6} {vals['英文']:>6} │ "
            f"{winner:>4} {snippet}{'…' if len(c)>28 else ''}"
        )
    print("─" * 75)
    winner = min(total, key=total.get)
    print(f"{'Σ':<3} {total['文言']:>6} {total['白話']:>6} {total['英文']:>6} │ {winner:>4}")


def _print_term_cases(claude_tok):
    """Print how *claude_tok* splits the closing sentence (technical terms) of each text."""
    print("\n" + "=" * 85)
    print("🔬 案例分析:技術術語的 token 切分")
    print("=" * 85)
    cases = [
        ("文言", "閱畢,即速寫超文本標記語言、階層樣式表及爪哇腳本,以達吾願、傳汝碼。"),
        ("白話", "看完之後,就趕快去寫 HTML、CSS 和 JavaScript,來實現目標、交付你的程式碼。"),
        ("英文", "After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."),
    ]
    for label, text in cases:
        result = claude_tok.encode(text)
        print(f"\n {label} ({len(result.ids)} tokens): {text}")
        print(f" 切分: {result.tokens}")


def _print_conclusions(all_results):
    """Print the numbered conclusions derived from *all_results*.

    Args:
        all_results: tokenizer label -> (classical, modern, english) token
            counts; must be non-empty.
    """
    print("\n" + "=" * 85)
    print("💡 結論")
    print("=" * 85)
    c_chars = _char_count(texts["文言文"])
    m_chars = _char_count(texts["白話文"])
    char_saving = (1 - c_chars / m_chars) * 100
    n = 1

    # 1. Character count vs. token count (only if a Claude result exists).
    claude_key = next((k for k in all_results if "Claude" in k), None)
    if claude_key:
        claude_c, claude_m, _ = all_results[claude_key]
        diff_pct = (claude_c / claude_m - 1) * 100
        if claude_c == claude_m:
            verdict = f"完全一樣({claude_c} vs {claude_m})"
        elif abs(diff_pct) < 1:
            # Close but not equal: do not claim the counts are identical.
            verdict = f"幾乎一樣(差距 <1%:{claude_c} vs {claude_m})"
        elif diff_pct > 0:
            verdict = f"文言反而多 {diff_pct:.1f}%({claude_c} vs {claude_m})"
        else:
            verdict = f"文言少 {abs(diff_pct):.1f}%({claude_c} vs {claude_m})"
        print(f"\n {n}. 文言文字數少 {char_saving:.0f}%,但 token 數在 Claude 上{verdict}")
        n += 1

    # 2. Classical vs. modern across tokenizers (negative = classical cheaper).
    cm_ratios = {name: (c / m - 1) * 100 for name, (c, m, e) in all_results.items() if m > 0}
    classical_wins = {k: v for k, v in cm_ratios.items() if v < -1}  # classical saves >1%
    modern_wins = {k: v for k, v in cm_ratios.items() if v > 1}  # modern saves >1%
    ties = {k: v for k, v in cm_ratios.items() if abs(v) <= 1}  # within 1%
    # "All tokenizers" is only true when there are neither opposite wins nor ties.
    if classical_wins and not modern_wins and not ties:
        best_name = min(classical_wins, key=classical_wins.get)
        print(f" {n}. 文言在所有 tokenizer 上都更省 token(最多:{best_name} 省 {abs(classical_wins[best_name]):.1f}%)")
    elif modern_wins and not classical_wins and not ties:
        worst_name = max(modern_wins, key=modern_wins.get)
        print(f" {n}. 白話在所有 tokenizer 上都更省 token(最多:{worst_name} 省 {modern_wins[worst_name]:.1f}%)")
    else:
        # Mixed outcome: report each group that is present.
        parts = []
        if classical_wins:
            best = min(classical_wins, key=classical_wins.get)
            parts.append(f"文言較省的有 {len(classical_wins)} 個(如 {best} 省 {abs(classical_wins[best]):.1f}%)")
        if modern_wins:
            worst = max(modern_wins, key=modern_wins.get)
            parts.append(f"白話較省的有 {len(modern_wins)} 個(如 {worst} 省 {modern_wins[worst]:.1f}%)")
        if ties:
            parts.append(f"差距 <1% 的有 {len(ties)} 個")
        print(f" {n}. 跨模型結果不一致:{';'.join(parts)}")
    n += 1

    # 3. How English compares (positive = English uses fewer tokens).
    en_vs_c = {name: (1 - e / c) * 100 for name, (c, m, e) in all_results.items() if e > 0 and c > 0}
    en_vs_m = {name: (1 - e / m) * 100 for name, (c, m, e) in all_results.items() if e > 0 and m > 0}
    # The emptiness checks keep min()/max() below from seeing an empty sequence.
    en_beats_c = bool(en_vs_c) and all(v > 0 for v in en_vs_c.values())
    en_beats_m = bool(en_vs_m) and all(v > 0 for v in en_vs_m.values())
    if en_beats_c and en_beats_m:
        range_min = int(min(min(en_vs_c.values()), min(en_vs_m.values())))
        range_max = int(max(max(en_vs_c.values()), max(en_vs_m.values())))
        print(f" {n}. 英文在所有 tokenizer 上都比中文省(省 {range_min}%~{range_max}%)")
    elif en_beats_c:
        range_min, range_max = int(min(en_vs_c.values())), int(max(en_vs_c.values()))
        print(f" {n}. 英文在所有 tokenizer 上都比文言省(省 {range_min}%~{range_max}%),但個別 tokenizer 上與白話互有勝負")
    else:
        print(f" {n}. 英文在多數 tokenizer 上比中文省 token,但非絕對(取決於 tokenizer 對中文的優化程度)")
    n += 1

    # 4. Why.
    print(f" {n}. 原因:BPE tokenizer 按訓練語料統計頻率切分")
    print(" - 語料中出現頻率越高的字詞序列,越容易被合併成單一 token")
    print(" - 文言生僻字可能被拆成多個 byte-level tokens,抵銷字數優勢")
    print(" - 對中文優化較深的 tokenizer(如 Qwen),現代白話的效率更高")
    n += 1

    # 5. Recommendation.
    print(f" {n}. 字數少 ≠ token 少。想省 token,關鍵是選用 tokenizer 訓練語料中的高頻詞彙,")
    print(" 而非追求文體上的簡短")
    print()


def main():
    """Run the whole comparison and print the report to stdout.

    Sections: summary table for every loaded tokenizer, per-line comparison
    and technical-term split (both need the Claude tokenizer and are skipped
    if it cannot be loaded), then the conclusions, which are printed from the
    summary results regardless of whether the Claude sections ran.
    """
    tok_map = load_tokenizers()
    if not tok_map:
        # Nothing to compare; the conclusions would have no data to work on.
        print("(沒有可用的 tokenizer,無法比較)")
        return

    all_results = _print_summary_table(tok_map)

    # The detailed sections need the tokenizer object itself (for .tokens),
    # which load_tokenizers() does not expose, so it is loaded again here.
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
    except Exception:
        print("\n(Claude tokenizer 不可用,跳過逐句對比)")
    else:
        _print_line_by_line(claude_tok)
        _print_term_cases(claude_tok)

    _print_conclusions(all_results)
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment