Last active
March 27, 2026 03:38
-
-
Save lis186/38d55b9a650d76711438103c1935165a to your computer and use it in GitHub Desktop.
用文言,可減 token 乎?— 實測文言文 vs 白話文 vs English 的 Token 數(Claude / GPT / Qwen / DeepSeek)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| 用文言,可減 token 乎?— 實測文言文 vs 白話文 vs English 的 Token 數 | |
| 起因:有人在 LinkedIn 用文言文寫了一篇「前端工程師修養論」, | |
| 底下有人問 Haiku 4.5「用文言,可減 token 乎?」,Haiku 回「可也」。 | |
| 真的嗎?我們來實測。 | |
| 測試方法: | |
| - 取文言原文,翻譯成等義的白話文和英文 | |
| - 用 5 種主流 tokenizer 分別計算 token 數 | |
| - 逐句對比,找出差異來源 | |
| 需要安裝: | |
| pip install tiktoken tokenizers transformers | |
| 結論(劇透): | |
| - Claude tokenizer: 文言 406 vs 白話 406 → 完全一樣 | |
| - 英文在所有 tokenizer 上都大幅領先(省 28%~62%) | |
| - 文言文字數少 19%,但 token 數沒省到 | |
| - 「爪哇腳本」4 tokens,「JavaScript」1 token | |
| """ | |
| import tiktoken | |
| from tokenizers import Tokenizer | |
# ================================================================
# Test texts: the same content written in three registers
# ================================================================
# Classical Chinese original, quoted from a LinkedIn post by 江中喬
classical = """君子編程,必也求乎。求者,乃得解於人工智能也。
汝乃一前端工程師也,於此,汝必撰一門面。
夫頁首者,門面之梁也。重簡便,不宜混亂。夫佈局者,貴乎流動而不滯;樣式者,取其簡約而不雜。手執腳本之利刃,運響應於掌心。雖瀏覽器千差萬別,心法一貫,則相容無礙。
若夫代碼之美,必也邏輯之嚴密、註釋之清明。
不求浮誇之風,但求用戶之易貶也,老嫗能解為佳。
故曰:工欲善其事,必先構其代碼、鞭其錯蟲、理其頭緒、空乏其版,所以動心忍性,增益吾所不能。
余勉也,為得一佳文得傳千古,汝必莫使多紕漏,徒留殘蟲,貽笑大方。
閱畢,即速寫超文本標記語言、階層樣式表及爪哇腳本,以達吾願、傳汝碼。"""
# Modern vernacular Chinese translation (same meaning)
modern = """工程師寫程式,一定要追求品質。追求品質,就能從 AI 工具中獲得好的解答。
你是一個前端工程師,所以你必須做好頁面。
頁首是網站的門面,要簡潔,不要混亂。佈局要流暢、不能卡頓;樣式要簡約、不要繁雜。要熟練使用 JavaScript,靈活運用響應式設計。雖然瀏覽器各不相同,但核心方法是一致的,就能做到跨瀏覽器相容。
好的代碼,一定要邏輯嚴密、註釋清楚。
不追求花俏的風格,只要讓使用者容易理解就好,連不懂技術的人都能看懂最好。
所以說:要做好工作,必須先寫好代碼、修好 bug、整理好思路、清理好版本,這樣才能磨練心性,提升自己的能力。
我勉勵你,要寫出能流傳的好代碼,不要留下太多漏洞,否則留下一堆 bug,會被人笑話。
看完之後,就趕快去寫 HTML、CSS 和 JavaScript,來實現目標、交付你的程式碼。"""
# English translation (equivalent meaning)
english = """Engineers should pursue quality in their code. By pursuing quality, you can get good answers from AI tools.
You are a frontend engineer, so you must build a good interface.
The header is the face of a website — keep it simple, not cluttered. Layout should flow smoothly without stalling; styles should be minimal, not messy. Wield JavaScript skillfully, and master responsive design. Though browsers vary widely, the core approach is consistent, and you can achieve cross-browser compatibility.
Good code must have rigorous logic and clear comments.
Don't pursue flashy styles — just make it easy for users to understand. If even a non-technical person can follow it, that's ideal.
As the saying goes: to do good work, you must first write solid code, fix bugs, organize your thoughts, and clean up your versions. This is how you build resilience and improve your abilities.
I encourage you: write code worthy of lasting, and don't leave too many holes behind — otherwise you'll be left with bugs and become a laughingstock.
After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."""
# Display label -> text; the labels double as lookup keys in main().
texts = {"文言文": classical, "白話文": modern, "English": english}
| # ================================================================ | |
| # 載入 Tokenizers | |
| # ================================================================ | |
def load_tokenizers():
    """Build a mapping from display label to a token-counting callable.

    Each callable takes a string and returns how many tokens the
    corresponding tokenizer produces for it.

    The Hugging Face-backed tokenizers (Claude, Qwen, DeepSeek) are loaded on
    a best-effort basis: if one cannot be loaded, a warning is printed and
    that entry is simply absent from the result. The two tiktoken encodings
    are not guarded; an error from ``tiktoken.get_encoding`` propagates.

    Returns:
        dict: ``label -> callable(str) -> int`` in insertion order
        (Claude, cl100k, o200k, Qwen, DeepSeek — minus any that failed).
    """
    tok_map = {}

    # Claude (Xenova/claude-tokenizer, a publicly available Claude BPE tokenizer)
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
    except Exception as e:
        print(f"⚠️ Claude tokenizer: {e}")
    else:
        tok_map["Claude (all models)"] = lambda t: len(claude_tok.encode(t).ids)

    # GPT-4 / GPT-3.5 Turbo
    enc_cl100k = tiktoken.get_encoding("cl100k_base")
    tok_map["GPT-4 / 3.5-Turbo (cl100k)"] = lambda t: len(enc_cl100k.encode(t))

    # GPT-4o / o1 / o3
    enc_o200k = tiktoken.get_encoding("o200k_base")
    tok_map["GPT-4o / o1 / o3 (o200k)"] = lambda t: len(enc_o200k.encode(t))

    # Qwen 2.5
    # The import stays inside the try so that a missing `transformers`
    # install is reported like any other load failure instead of crashing.
    try:
        from transformers import AutoTokenizer

        qwen_tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
    except Exception as e:
        print(f"⚠️ Qwen tokenizer: {e}")
    else:
        # add_special_tokens=False: count only the text's own tokens so the
        # numbers are comparable with tiktoken, which adds no BOS/EOS.
        tok_map["Qwen 2.5"] = lambda t: len(
            qwen_tok.encode(t, add_special_tokens=False)
        )

    # DeepSeek V2
    try:
        from transformers import AutoTokenizer

        ds_tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2-Lite")
    except Exception as e:
        print(f"⚠️ DeepSeek tokenizer: {e}")
    else:
        # NOTE(review): this tokenizer is believed to prepend a BOS token by
        # default, which would inflate every count by one — confirm against
        # its tokenizer_config.json.
        tok_map["DeepSeek V2"] = lambda t: len(
            ds_tok.encode(t, add_special_tokens=False)
        )

    return tok_map
| # ================================================================ | |
| # 主程式 | |
| # ================================================================ | |
def main():
    """Run the token-count comparison and print the full report.

    The report has four sections: a cross-tokenizer summary table, a
    line-by-line comparison and a technical-term splitting case study (both
    using the Claude tokenizer), and a list of conclusions derived from the
    summary numbers.

    If the Claude tokenizer cannot be loaded, only the two Claude-specific
    sections are skipped; the conclusions are still printed from whatever
    tokenizers did load.
    """
    tok_map = load_tokenizers()
    all_results = _print_summary_table(tok_map)

    # The per-line and term sections need the tokenizer object itself (for
    # `.tokens`), while load_tokenizers() only exposes counting callables.
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
    except Exception:
        print("\n(Claude tokenizer 不可用,跳過逐句對比)")
        claude_tok = None

    if claude_tok is not None:
        _print_line_comparison(claude_tok)
        _print_term_cases(claude_tok)

    _print_conclusions(all_results)


def _char_count(text):
    """Return the length of *text* ignoring newlines and ASCII spaces."""
    return len(text.replace("\n", "").replace(" ", ""))


def _print_summary_table(tok_map):
    """Print text sizes and the per-tokenizer totals table.

    Args:
        tok_map: label -> callable(str) -> int, as built by load_tokenizers().

    Returns:
        dict: label -> (classical_tokens, modern_tokens, english_tokens).
    """
    print("=" * 85)
    print("📊 文言文 vs 白話文 vs English — 跨模型 Token 數完整對比")
    print("=" * 85)

    print("\n📏 文本長度:")
    for label, text in texts.items():
        print(f" {label}: {_char_count(text)} 字元 / {len(text.encode('utf-8'))} bytes")

    header = f"\n{'Tokenizer':<32} {'文言文':>7} {'白話文':>7} {'English':>7} │ {'文vs白':>7} {'文vs英':>7} {'白vs英':>7}"
    print(header)
    # Rule length is taken from the ASCII-only part of the header; this is an
    # approximation of the displayed width, not an exact column count.
    print("─" * len(header.encode("ascii", "ignore").decode()))

    all_results = {}
    for name, fn in tok_map.items():
        c = fn(texts["文言文"])
        m = fn(texts["白話文"])
        e = fn(texts["English"])
        all_results[name] = (c, m, e)
        # Positive percentage: the first text needs fewer tokens than the second.
        cm = f"{(1-c/m)*100:+.1f}%"
        ce = f"{(1-c/e)*100:+.1f}%"
        me = f"{(1-m/e)*100:+.1f}%"
        print(f"{name:<32} {c:>7} {m:>7} {e:>7} │ {cm:>7} {ce:>7} {me:>7}")

    print("\n +值 = 省了 token(前者比後者少)")
    print(" -值 = 多用 token(前者比後者多)")
    print("\n📝 GPT 模型 → tokenizer 對應:")
    print(" cl100k_base → GPT-4, GPT-4 Turbo, GPT-3.5 Turbo")
    print(" o200k_base → GPT-4o, GPT-4o-mini, o1, o1-mini, o3, o3-mini")
    return all_results


def _print_line_comparison(claude_tok):
    """Print per-line Claude token counts for the three texts side by side."""
    print("\n" + "=" * 85)
    print("📝 逐句對比 (Claude tokenizer)")
    print("=" * 85)

    def non_empty_lines(text):
        return [ln.strip() for ln in text.strip().split("\n") if ln.strip()]

    def count(line):
        # Padding rows are empty strings; report them as zero tokens.
        return len(claude_tok.encode(line).ids) if line else 0

    c_lines = non_empty_lines(classical)
    m_lines = non_empty_lines(modern)
    e_lines = non_empty_lines(english)
    # Pad the shorter lists so zip() below does not drop trailing lines.
    max_len = max(len(c_lines), len(m_lines), len(e_lines))
    for lines in (c_lines, m_lines, e_lines):
        lines.extend([""] * (max_len - len(lines)))

    print(f"\n{'#':<3} {'文言':>6} {'白話':>6} {'英文':>6} │ {'最少':>4} 摘要")
    print("─" * 75)

    total = {"文言": 0, "白話": 0, "英文": 0}
    for i, (c, m, e) in enumerate(zip(c_lines, m_lines, e_lines)):
        ct, mt, et = count(c), count(m), count(e)
        total["文言"] += ct
        total["白話"] += mt
        total["英文"] += et
        vals = {"文言": ct, "白話": mt, "英文": et}
        # Only name a winner when all three texts have a line at this row.
        winner = min(vals, key=vals.get) if all(v > 0 for v in vals.values()) else "—"
        snippet = c[:28] if c else ""
        print(f"{i+1:<3} {ct:>6} {mt:>6} {et:>6} │ {winner:>4} {snippet}{'…' if len(c)>28 else ''}")

    print("─" * 75)
    winner = min(total, key=total.get)
    print(f"{'Σ':<3} {total['文言']:>6} {total['白話']:>6} {total['英文']:>6} │ {winner:>4}")


def _print_term_cases(claude_tok):
    """Print how the Claude tokenizer splits the closing sentence of each text."""
    print("\n" + "=" * 85)
    print("🔬 案例分析:技術術語的 token 切分")
    print("=" * 85)

    cases = [
        ("文言", "閱畢,即速寫超文本標記語言、階層樣式表及爪哇腳本,以達吾願、傳汝碼。"),
        ("白話", "看完之後,就趕快去寫 HTML、CSS 和 JavaScript,來實現目標、交付你的程式碼。"),
        ("英文", "After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."),
    ]
    for label, text in cases:
        result = claude_tok.encode(text)
        print(f"\n {label} ({len(result.ids)} tokens): {text}")
        print(f" 切分: {result.tokens}")


def _print_conclusions(all_results):
    """Print numbered conclusions derived from the summary totals.

    Args:
        all_results: label -> (classical_tokens, modern_tokens, english_tokens).
    """
    print("\n" + "=" * 85)
    print("💡 結論")
    print("=" * 85)

    c_chars = _char_count(texts["文言文"])
    m_chars = _char_count(texts["白話文"])
    char_saving = (1 - c_chars / m_chars) * 100
    # Running item number, so the list stays consecutive when item 1 is skipped.
    n = 1

    # 1. Gap between character count and token count (Claude only)
    claude_key = next((k for k in all_results if "Claude" in k), None)
    if claude_key:
        cc, cm, _ = all_results[claude_key]
        diff_pct = (cc / cm - 1) * 100
        if abs(diff_pct) < 1:
            verdict = f"完全一樣({cc} vs {cm})"
        elif diff_pct > 0:
            verdict = f"文言反而多 {diff_pct:.1f}%({cc} vs {cm})"
        else:
            verdict = f"文言少 {abs(diff_pct):.1f}%({cc} vs {cm})"
        print(f"\n {n}. 文言文字數少 {char_saving:.0f}%,但 token 數在 Claude 上{verdict}")
        n += 1

    # 2. Classical vs modern across tokenizers
    # Negative ratio: classical uses fewer tokens; positive: modern uses fewer.
    cm_ratios = {name: (c / m - 1) * 100 for name, (c, m, e) in all_results.items() if m > 0}
    classical_wins = {k: v for k, v in cm_ratios.items() if v < -1}  # classical saves >1%
    modern_wins = {k: v for k, v in cm_ratios.items() if v > 1}  # modern saves >1%
    ties = {k: v for k, v in cm_ratios.items() if abs(v) <= 1}  # within 1%
    if classical_wins and not modern_wins:
        best_name = min(classical_wins, key=classical_wins.get)
        print(f" {n}. 文言在所有 tokenizer 上都更省 token(最多:{best_name} 省 {abs(classical_wins[best_name]):.1f}%)")
    elif modern_wins and not classical_wins:
        worst_name = max(modern_wins, key=modern_wins.get)
        print(f" {n}. 白話在所有 tokenizer 上都更省 token(最多:{worst_name} 省 {modern_wins[worst_name]:.1f}%)")
    else:
        # Mixed outcome: some tokenizers favour classical, some favour modern.
        parts = []
        if classical_wins:
            best = min(classical_wins, key=classical_wins.get)
            parts.append(f"文言較省的有 {len(classical_wins)} 個(如 {best} 省 {abs(classical_wins[best]):.1f}%)")
        if modern_wins:
            worst = max(modern_wins, key=modern_wins.get)
            parts.append(f"白話較省的有 {len(modern_wins)} 個(如 {worst} 省 {modern_wins[worst]:.1f}%)")
        if ties:
            parts.append(f"差距 <1% 的有 {len(ties)} 個")
        print(f" {n}. 跨模型結果不一致:{';'.join(parts)}")
    n += 1

    # 3. How English fares
    en_vs_c = {name: (1 - e / c) * 100 for name, (c, m, e) in all_results.items() if e > 0 and c > 0}
    en_vs_m = {name: (1 - e / m) * 100 for name, (c, m, e) in all_results.items() if e > 0 and m > 0}
    # all() is True for an empty dict; require data so min()/max() below
    # never run on an empty sequence.
    en_beats_c = bool(en_vs_c) and all(v > 0 for v in en_vs_c.values())
    en_beats_m = bool(en_vs_m) and all(v > 0 for v in en_vs_m.values())
    if en_beats_c and en_beats_m:
        range_min = int(min(min(en_vs_c.values()), min(en_vs_m.values())))
        range_max = int(max(max(en_vs_c.values()), max(en_vs_m.values())))
        print(f" {n}. 英文在所有 tokenizer 上都比中文省(省 {range_min}%~{range_max}%)")
    elif en_beats_c:
        range_min, range_max = int(min(en_vs_c.values())), int(max(en_vs_c.values()))
        print(f" {n}. 英文在所有 tokenizer 上都比文言省(省 {range_min}%~{range_max}%),但個別 tokenizer 上與白話互有勝負")
    else:
        print(f" {n}. 英文在多數 tokenizer 上比中文省 token,但非絕對(取決於 tokenizer 對中文的優化程度)")
    n += 1

    # 4. Why
    print(f" {n}. 原因:BPE tokenizer 按訓練語料統計頻率切分")
    print(" - 語料中出現頻率越高的字詞序列,越容易被合併成單一 token")
    print(" - 文言生僻字可能被拆成多個 byte-level tokens,抵銷字數優勢")
    print(" - 對中文優化較深的 tokenizer(如 Qwen),現代白話的效率更高")
    n += 1

    # 5. Recommendation
    print(f" {n}. 字數少 ≠ token 少。想省 token,關鍵是選用 tokenizer 訓練語料中的高頻詞彙,")
    print(" 而非追求文體上的簡短")
    print()
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment