lis186 · March 27, 2026 03:38
diff --git a/gist_classical_chinese_token_test.py b/gist_classical_chinese_token_test.py
 """
 用文言，可減 token 乎？— 實測文言文 vs 白話文 vs English 的 Token 數

 起因：有人在 LinkedIn 用文言文寫了一篇「前端工程師修養論」，
 底下有人問 Haiku 4.5「用文言，可減 token 乎？」，Haiku 回「可也」。
 真的嗎？我們來實測。

 測試方法：
 - 取文言原文，翻譯成等義的白話文和英文
 - 用 5 種主流 tokenizer 分別計算 token 數
 - 逐句對比，找出差異來源

 需要安裝：
  pip install tiktoken tokenizers transformers

 結論（劇透）：
 - Claude tokenizer: 文言 406 vs 白話 406 → 完全一樣
 - 英文在所有 tokenizer 上都大幅領先（省 28%~62%）
 - 文言文字數少 19%，但 token 數沒省到
 - 「爪哇腳本」4 tokens，「JavaScript」1 token
 """

 import tiktoken
 from tokenizers import Tokenizer

 # ================================================================
 # Test texts: the same content written in three registers
 # ================================================================

 # Classical Chinese original — taken from a LinkedIn post by 江中喬
 classical = """君子編程，必也求乎。求者，乃得解於人工智能也。
 汝乃一前端工程師也，於此，汝必撰一門面。

 夫頁首者，門面之梁也。重簡便，不宜混亂。夫佈局者，貴乎流動而不滯；樣式者，取其簡約而不雜。手執腳本之利刃，運響應於掌心。雖瀏覽器千差萬別，心法一貫，則相容無礙。

 若夫代碼之美，必也邏輯之嚴密、註釋之清明。

 不求浮誇之風，但求用戶之易貶也，老嫗能解為佳。

 故曰：工欲善其事，必先構其代碼、鞭其錯蟲、理其頭緒、空乏其版，所以動心忍性，增益吾所不能。

 余勉也，為得一佳文得傳千古，汝必莫使多紕漏，徒留殘蟲，貽笑大方。

 閱畢，即速寫超文本標記語言、階層樣式表及爪哇腳本，以達吾願、傳汝碼。"""

 # Modern vernacular Chinese translation (same meaning)
 modern = """工程師寫程式，一定要追求品質。追求品質，就能從 AI 工具中獲得好的解答。
 你是一個前端工程師，所以你必須做好頁面。

 頁首是網站的門面，要簡潔，不要混亂。佈局要流暢、不能卡頓；樣式要簡約、不要繁雜。要熟練使用 JavaScript，靈活運用響應式設計。雖然瀏覽器各不相同，但核心方法是一致的，就能做到跨瀏覽器相容。

 好的代碼，一定要邏輯嚴密、註釋清楚。

 不追求花俏的風格，只要讓使用者容易理解就好，連不懂技術的人都能看懂最好。

 所以說：要做好工作，必須先寫好代碼、修好 bug、整理好思路、清理好版本，這樣才能磨練心性，提升自己的能力。

 我勉勵你，要寫出能流傳的好代碼，不要留下太多漏洞，否則留下一堆 bug，會被人笑話。

 看完之後，就趕快去寫 HTML、CSS 和 JavaScript，來實現目標、交付你的程式碼。"""

 # English translation (equivalent meaning)
 english = """Engineers should pursue quality in their code. By pursuing quality, you can get good answers from AI tools.
 You are a frontend engineer, so you must build a good interface.

 The header is the face of a website — keep it simple, not cluttered. Layout should flow smoothly without stalling; styles should be minimal, not messy. Wield JavaScript skillfully, and master responsive design. Though browsers vary widely, the core approach is consistent, and you can achieve cross-browser compatibility.

 Good code must have rigorous logic and clear comments.

 Don't pursue flashy styles — just make it easy for users to understand. If even a non-technical person can follow it, that's ideal.

 As the saying goes: to do good work, you must first write solid code, fix bugs, organize your thoughts, and clean up your versions. This is how you build resilience and improve your abilities.

 I encourage you: write code worthy of lasting, and don't leave too many holes behind — otherwise you'll be left with bugs and become a laughingstock.

 After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."""

 # Display label -> text; the three labels are used as lookup keys when counting.
 texts = {"文言文": classical, "白話文": modern, "English": english}


 # ================================================================
 # 載入 Tokenizers
 # ================================================================

 def load_tokenizers():
    """Build the token counters used in the comparison.

    Returns:
        dict mapping a display name to a callable ``str -> int`` that returns
        how many tokens the corresponding tokenizer produces for a text.

    The two tiktoken encodings are always added. The Claude, Qwen and
    DeepSeek tokenizers are best-effort: when loading one fails, a warning is
    printed and that entry is simply left out of the mapping.

    The Hugging Face tokenizers are called with ``add_special_tokens=False``
    so that whatever special tokens (e.g. BOS) a tokenizer is configured to
    add do not inflate its count relative to tiktoken, which adds none.
    """
    tok_map = {}

    # Claude (Xenova/claude-tokenizer — the publicly available Claude BPE tokenizer)
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
    except Exception as e:
        print(f"⚠️  Claude tokenizer: {e}")
    else:
        tok_map["Claude (all models)"] = lambda t: len(
            claude_tok.encode(t, add_special_tokens=False).ids
        )

    # GPT-4 / GPT-3.5 Turbo
    enc_cl100k = tiktoken.get_encoding("cl100k_base")
    tok_map["GPT-4 / 3.5-Turbo (cl100k)"] = lambda t: len(enc_cl100k.encode(t))

    # GPT-4o / o1 / o3
    enc_o200k = tiktoken.get_encoding("o200k_base")
    tok_map["GPT-4o / o1 / o3 (o200k)"] = lambda t: len(enc_o200k.encode(t))

    # Qwen 2.5 and DeepSeek V2 are both loaded through transformers.
    hf_models = (
        ("Qwen 2.5", "Qwen/Qwen2.5-0.5B"),
        ("DeepSeek V2", "deepseek-ai/DeepSeek-V2-Lite"),
    )
    try:
        from transformers import AutoTokenizer
    except Exception as e:
        # transformers is optional here; report why its tokenizers are missing
        # instead of dropping them silently.
        print(f"⚠️  transformers: {e}")
        return tok_map

    for label, repo in hf_models:
        try:
            hf_tok = AutoTokenizer.from_pretrained(repo)
        except Exception as e:
            print(f"⚠️  {label} tokenizer: {e}")
            continue
        # Bind the tokenizer as a default argument: a plain closure would only
        # see the last value of hf_tok once the loop has finished.
        tok_map[label] = lambda t, tok=hf_tok: len(
            tok.encode(t, add_special_tokens=False)
        )

    return tok_map


 # ================================================================
 # 主程式
 # ================================================================

 def main():
    """Run the whole comparison and print the report to stdout.

    Sections, in order: the cross-tokenizer summary table, the per-line
    comparison and the technical-term segmentation (both need the Claude
    tokenizer and are skipped together when it cannot be loaded), and the
    conclusions, which are derived from the summary counts and the texts
    and are therefore always printed.
    """
    tok_map = load_tokenizers()
    all_results = _print_summary_table(tok_map)

    # Only the two Claude-specific sections depend on this load; a failure
    # here must not suppress the conclusions computed from all_results.
    try:
        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
    except Exception:
        print("\n（Claude tokenizer 不可用，跳過逐句對比）")
    else:
        _print_line_comparison(claude_tok)
        _print_term_cases(claude_tok)

    _print_conclusions(all_results)


 def _char_count(t):
    """Return the length of ``t`` without counting newlines and spaces."""
    return len(t.replace("\n", "").replace(" ", ""))


 def _print_summary_table(tok_map):
    """Print text lengths and the per-tokenizer token table.

    Args:
        tok_map: dict mapping a tokenizer name to a ``str -> int`` counter.

    Returns:
        dict mapping each tokenizer name to a tuple
        ``(classical_tokens, modern_tokens, english_tokens)``.
    """
    print("=" * 85)
    print("📊 文言文 vs 白話文 vs English — 跨模型 Token 數完整對比")
    print("=" * 85)

    print("\n📏 文本長度:")
    for label, text in texts.items():
        print(f"   {label}: {_char_count(text)} 字元 / {len(text.encode('utf-8'))} bytes")

    header = f"\n{'Tokenizer':<32} {'文言文':>7} {'白話文':>7} {'English':>7} │ {'文vs白':>7} {'文vs英':>7} {'白vs英':>7}"
    print(header)
    # The rule is as long as the header with its non-ASCII characters dropped.
    print("─" * len(header.encode("ascii", "ignore").decode()))

    all_results = {}
    for name, fn in tok_map.items():
        c = fn(texts["文言文"])
        m = fn(texts["白話文"])
        e = fn(texts["English"])
        all_results[name] = (c, m, e)
        # Each column is 1 - first/second: positive means the first text
        # needs fewer tokens than the second.
        cm = f"{(1-c/m)*100:+.1f}%"
        ce = f"{(1-c/e)*100:+.1f}%"
        me = f"{(1-m/e)*100:+.1f}%"
        print(f"{name:<32} {c:>7} {m:>7} {e:>7} │ {cm:>7} {ce:>7} {me:>7}")

    print("\n  +值 = 省了 token（前者比後者少）")
    print("  -值 = 多用 token（前者比後者多）")

    print("\n📝 GPT 模型 → tokenizer 對應:")
    print("   cl100k_base → GPT-4, GPT-4 Turbo, GPT-3.5 Turbo")
    print("   o200k_base  → GPT-4o, GPT-4o-mini, o1, o1-mini, o3, o3-mini")

    return all_results


 def _print_line_comparison(claude_tok):
    """Print Claude token counts for each non-blank line of the three texts.

    Lines are paired by position after blank lines are dropped. When the
    texts have different numbers of lines, the missing cells count as 0
    tokens and no winner is shown for that row.
    """
    from itertools import zip_longest

    print("\n" + "=" * 85)
    print("📝 逐句對比 (Claude tokenizer)")
    print("=" * 85)

    def non_blank(text):
        return [line.strip() for line in text.strip().split("\n") if line.strip()]

    def count(line):
        # Special tokens are excluded so every row counts text only.
        return len(claude_tok.encode(line, add_special_tokens=False).ids) if line else 0

    print(f"\n{'#':<3} {'文言':>6} {'白話':>6} {'英文':>6} │ {'最少':>4}  摘要")
    print("─" * 75)

    total = {"文言": 0, "白話": 0, "英文": 0}
    rows = zip_longest(non_blank(classical), non_blank(modern), non_blank(english), fillvalue="")
    for i, (c, m, e) in enumerate(rows, start=1):
        vals = {"文言": count(c), "白話": count(m), "英文": count(e)}
        for key, value in vals.items():
            total[key] += value

        # A winner is only meaningful when all three texts have this line.
        winner = min(vals, key=vals.get) if all(v > 0 for v in vals.values()) else "—"
        snippet = c[:28] if c else ""
        print(f"{i:<3} {vals['文言']:>6} {vals['白話']:>6} {vals['英文']:>6} │ {winner:>4}  {snippet}{'…' if len(c)>28 else ''}")

    print("─" * 75)
    winner = min(total, key=total.get)
    print(f"{'Σ':<3} {total['文言']:>6} {total['白話']:>6} {total['英文']:>6} │ {winner:>4}")


 def _print_term_cases(claude_tok):
    """Print how the Claude tokenizer splits the closing sentence of each text."""
    print("\n" + "=" * 85)
    print("🔬 案例分析：技術術語的 token 切分")
    print("=" * 85)

    cases = [
        ("文言", "閱畢，即速寫超文本標記語言、階層樣式表及爪哇腳本，以達吾願、傳汝碼。"),
        ("白話", "看完之後，就趕快去寫 HTML、CSS 和 JavaScript，來實現目標、交付你的程式碼。"),
        ("英文", "After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."),
    ]

    for label, text in cases:
        result = claude_tok.encode(text, add_special_tokens=False)
        print(f"\n  {label} ({len(result.ids)} tokens): {text}")
        print(f"  切分: {result.tokens}")


 def _print_conclusions(all_results):
    """Print the numbered conclusions derived from the summary counts.

    Args:
        all_results: dict mapping a tokenizer name to
            ``(classical_tokens, modern_tokens, english_tokens)``.
    """
    print("\n" + "=" * 85)
    print("💡 結論")
    print("=" * 85)

    c_chars = _char_count(texts["文言文"])
    m_chars = _char_count(texts["白話文"])
    char_saving = (1 - c_chars / m_chars) * 100

    n = 1

    # 1. Gap between character count and token count (Claude only).
    claude_key = next((k for k in all_results if "Claude" in k), None)
    if claude_key:
        cc, cm, _ = all_results[claude_key]
        diff_pct = (cc / cm - 1) * 100
        if cc == cm:
            verdict = f"完全一樣（{cc} vs {cm}）"
        elif abs(diff_pct) < 1:
            # Close, but not identical: do not claim the counts are the same.
            verdict = f"幾乎一樣（差距不到 1%，{cc} vs {cm}）"
        elif diff_pct > 0:
            verdict = f"文言反而多 {diff_pct:.1f}%（{cc} vs {cm}）"
        else:
            verdict = f"文言少 {abs(diff_pct):.1f}%（{cc} vs {cm}）"
        print(f"\n  {n}. 文言文字數少 {char_saving:.0f}%，但 token 數在 Claude 上{verdict}")
        n += 1

    # 2. Classical vs modern across tokenizers (negative ratio = classical smaller).
    cm_ratios = {name: (c / m - 1) * 100 for name, (c, m, e) in all_results.items() if m > 0}
    classical_wins = {k: v for k, v in cm_ratios.items() if v < -1}   # classical saves >1%
    modern_wins = {k: v for k, v in cm_ratios.items() if v > 1}       # modern saves >1%
    ties = {k: v for k, v in cm_ratios.items() if abs(v) <= 1}        # within 1%

    # "On every tokenizer" is only true when nothing ties or goes the other way.
    if classical_wins and not modern_wins and not ties:
        best_name = min(classical_wins, key=classical_wins.get)
        print(f"  {n}. 文言在所有 tokenizer 上都更省 token（最多：{best_name} 省 {abs(classical_wins[best_name]):.1f}%）")
    elif modern_wins and not classical_wins and not ties:
        worst_name = max(modern_wins, key=modern_wins.get)
        print(f"  {n}. 白話在所有 tokenizer 上都更省 token（最多：{worst_name} 省 {modern_wins[worst_name]:.1f}%）")
    elif ties and not classical_wins and not modern_wins:
        print(f"  {n}. 文言與白話在所有 tokenizer 上的差距都在 1% 以內")
    else:
        # Mixed outcome: list each group that is present.
        parts = []
        if classical_wins:
            best = min(classical_wins, key=classical_wins.get)
            parts.append(f"文言較省的有 {len(classical_wins)} 個（如 {best} 省 {abs(classical_wins[best]):.1f}%）")
        if modern_wins:
            worst = max(modern_wins, key=modern_wins.get)
            parts.append(f"白話較省的有 {len(modern_wins)} 個（如 {worst} 省 {modern_wins[worst]:.1f}%）")
        if ties:
            parts.append(f"差距 <1% 的有 {len(ties)} 個")
        print(f"  {n}. 跨模型結果不一致：{'；'.join(parts)}")
    n += 1

    # 3. How English compares (positive value = English uses fewer tokens).
    en_vs_c = {name: (1 - e / c) * 100 for name, (c, m, e) in all_results.items() if e > 0 and c > 0}
    en_vs_m = {name: (1 - e / m) * 100 for name, (c, m, e) in all_results.items() if e > 0 and m > 0}

    en_beats_c = all(v > 0 for v in en_vs_c.values())
    en_beats_m = all(v > 0 for v in en_vs_m.values())

    if en_beats_c and en_beats_m:
        savings = [*en_vs_c.values(), *en_vs_m.values()]
        range_min, range_max = int(min(savings)), int(max(savings))
        print(f"  {n}. 英文在所有 tokenizer 上都比中文省（省 {range_min}%~{range_max}%）")
    elif en_beats_c:
        range_min, range_max = int(min(en_vs_c.values())), int(max(en_vs_c.values()))
        print(f"  {n}. 英文在所有 tokenizer 上都比文言省（省 {range_min}%~{range_max}%），但個別 tokenizer 上與白話互有勝負")
    else:
        print(f"  {n}. 英文在多數 tokenizer 上比中文省 token，但非絕對（取決於 tokenizer 對中文的優化程度）")
    n += 1

    # 4. Explanation.
    print(f"  {n}. 原因：BPE tokenizer 按訓練語料統計頻率切分")
    print("     - 語料中出現頻率越高的字詞序列，越容易被合併成單一 token")
    print("     - 文言生僻字可能被拆成多個 byte-level tokens，抵銷字數優勢")
    print("     - 對中文優化較深的 tokenizer（如 Qwen），現代白話的效率更高")
    n += 1

    # 5. Recommendation.
    print(f"  {n}. 字數少 ≠ token 少。想省 token，關鍵是選用 tokenizer 訓練語料中的高頻詞彙，")
    print("     而非追求文體上的簡短")
    print()


 # Run the comparison only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	"""
	用文言，可減 token 乎？— 實測文言文 vs 白話文 vs English 的 Token 數

	起因：有人在 LinkedIn 用文言文寫了一篇「前端工程師修養論」，
	底下有人問 Haiku 4.5「用文言，可減 token 乎？」，Haiku 回「可也」。
	真的嗎？我們來實測。

	測試方法：
	- 取文言原文，翻譯成等義的白話文和英文
	- 用 5 種主流 tokenizer 分別計算 token 數
	- 逐句對比，找出差異來源

	需要安裝：
	pip install tiktoken tokenizers transformers

	結論（劇透）：
	- Claude tokenizer: 文言 406 vs 白話 406 → 完全一樣
	- 英文在所有 tokenizer 上都大幅領先（省 28%~62%）
	- 文言文字數少 19%，但 token 數沒省到
	- 「爪哇腳本」4 tokens，「JavaScript」1 token
	"""

	import tiktoken
	from tokenizers import Tokenizer

	# ================================================================
	# Test texts: the same content written in three registers
	# ================================================================

	# Classical Chinese original — taken from a LinkedIn post by 江中喬
	classical = """君子編程，必也求乎。求者，乃得解於人工智能也。
	汝乃一前端工程師也，於此，汝必撰一門面。

	夫頁首者，門面之梁也。重簡便，不宜混亂。夫佈局者，貴乎流動而不滯；樣式者，取其簡約而不雜。手執腳本之利刃，運響應於掌心。雖瀏覽器千差萬別，心法一貫，則相容無礙。

	若夫代碼之美，必也邏輯之嚴密、註釋之清明。

	不求浮誇之風，但求用戶之易貶也，老嫗能解為佳。

	故曰：工欲善其事，必先構其代碼、鞭其錯蟲、理其頭緒、空乏其版，所以動心忍性，增益吾所不能。

	余勉也，為得一佳文得傳千古，汝必莫使多紕漏，徒留殘蟲，貽笑大方。

	閱畢，即速寫超文本標記語言、階層樣式表及爪哇腳本，以達吾願、傳汝碼。"""

	# Modern vernacular Chinese translation (same meaning)
	modern = """工程師寫程式，一定要追求品質。追求品質，就能從 AI 工具中獲得好的解答。
	你是一個前端工程師，所以你必須做好頁面。

	頁首是網站的門面，要簡潔，不要混亂。佈局要流暢、不能卡頓；樣式要簡約、不要繁雜。要熟練使用 JavaScript，靈活運用響應式設計。雖然瀏覽器各不相同，但核心方法是一致的，就能做到跨瀏覽器相容。

	好的代碼，一定要邏輯嚴密、註釋清楚。

	不追求花俏的風格，只要讓使用者容易理解就好，連不懂技術的人都能看懂最好。

	所以說：要做好工作，必須先寫好代碼、修好 bug、整理好思路、清理好版本，這樣才能磨練心性，提升自己的能力。

	我勉勵你，要寫出能流傳的好代碼，不要留下太多漏洞，否則留下一堆 bug，會被人笑話。

	看完之後，就趕快去寫 HTML、CSS 和 JavaScript，來實現目標、交付你的程式碼。"""

	# English translation (equivalent meaning)
	english = """Engineers should pursue quality in their code. By pursuing quality, you can get good answers from AI tools.
	You are a frontend engineer, so you must build a good interface.

	The header is the face of a website — keep it simple, not cluttered. Layout should flow smoothly without stalling; styles should be minimal, not messy. Wield JavaScript skillfully, and master responsive design. Though browsers vary widely, the core approach is consistent, and you can achieve cross-browser compatibility.

	Good code must have rigorous logic and clear comments.

	Don't pursue flashy styles — just make it easy for users to understand. If even a non-technical person can follow it, that's ideal.

	As the saying goes: to do good work, you must first write solid code, fix bugs, organize your thoughts, and clean up your versions. This is how you build resilience and improve your abilities.

	I encourage you: write code worthy of lasting, and don't leave too many holes behind — otherwise you'll be left with bugs and become a laughingstock.

	After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."""

	# Display label -> text; the three labels are used as lookup keys when counting.
	texts = {"文言文": classical, "白話文": modern, "English": english}


	# ================================================================
	# 載入 Tokenizers
	# ================================================================

	def load_tokenizers():
	    """Build the token counters used in the comparison.

	    Returns:
	        dict mapping a display name to a callable ``str -> int`` that returns
	        how many tokens the corresponding tokenizer produces for a text.

	    The two tiktoken encodings are always added. The Claude, Qwen and
	    DeepSeek tokenizers are best-effort: when loading one fails, a warning is
	    printed and that entry is simply left out of the mapping.

	    The Hugging Face tokenizers are called with ``add_special_tokens=False``
	    so that whatever special tokens (e.g. BOS) a tokenizer is configured to
	    add do not inflate its count relative to tiktoken, which adds none.
	    """
	    tok_map = {}

	    # Claude (Xenova/claude-tokenizer — the publicly available Claude BPE tokenizer)
	    try:
	        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
	    except Exception as e:
	        print(f"⚠️ Claude tokenizer: {e}")
	    else:
	        tok_map["Claude (all models)"] = lambda t: len(
	            claude_tok.encode(t, add_special_tokens=False).ids
	        )

	    # GPT-4 / GPT-3.5 Turbo
	    enc_cl100k = tiktoken.get_encoding("cl100k_base")
	    tok_map["GPT-4 / 3.5-Turbo (cl100k)"] = lambda t: len(enc_cl100k.encode(t))

	    # GPT-4o / o1 / o3
	    enc_o200k = tiktoken.get_encoding("o200k_base")
	    tok_map["GPT-4o / o1 / o3 (o200k)"] = lambda t: len(enc_o200k.encode(t))

	    # Qwen 2.5 and DeepSeek V2 are both loaded through transformers.
	    hf_models = (
	        ("Qwen 2.5", "Qwen/Qwen2.5-0.5B"),
	        ("DeepSeek V2", "deepseek-ai/DeepSeek-V2-Lite"),
	    )
	    try:
	        from transformers import AutoTokenizer
	    except Exception as e:
	        # transformers is optional here; report why its tokenizers are missing
	        # instead of dropping them silently.
	        print(f"⚠️ transformers: {e}")
	        return tok_map

	    for label, repo in hf_models:
	        try:
	            hf_tok = AutoTokenizer.from_pretrained(repo)
	        except Exception as e:
	            print(f"⚠️ {label} tokenizer: {e}")
	            continue
	        # Bind the tokenizer as a default argument: a plain closure would only
	        # see the last value of hf_tok once the loop has finished.
	        tok_map[label] = lambda t, tok=hf_tok: len(
	            tok.encode(t, add_special_tokens=False)
	        )

	    return tok_map


	# ================================================================
	# 主程式
	# ================================================================

	def main():
	    """Run the whole comparison and print the report to stdout.

	    Sections, in order: the cross-tokenizer summary table, the per-line
	    comparison and the technical-term segmentation (both need the Claude
	    tokenizer and are skipped together when it cannot be loaded), and the
	    conclusions, which are derived from the summary counts and the texts
	    and are therefore always printed.
	    """
	    tok_map = load_tokenizers()
	    all_results = _print_summary_table(tok_map)

	    # Only the two Claude-specific sections depend on this load; a failure
	    # here must not suppress the conclusions computed from all_results.
	    try:
	        claude_tok = Tokenizer.from_pretrained("Xenova/claude-tokenizer")
	    except Exception:
	        print("\n（Claude tokenizer 不可用，跳過逐句對比）")
	    else:
	        _print_line_comparison(claude_tok)
	        _print_term_cases(claude_tok)

	    _print_conclusions(all_results)


	def _char_count(t):
	    """Return the length of ``t`` without counting newlines and spaces."""
	    return len(t.replace("\n", "").replace(" ", ""))


	def _print_summary_table(tok_map):
	    """Print text lengths and the per-tokenizer token table.

	    Args:
	        tok_map: dict mapping a tokenizer name to a ``str -> int`` counter.

	    Returns:
	        dict mapping each tokenizer name to a tuple
	        ``(classical_tokens, modern_tokens, english_tokens)``.
	    """
	    print("=" * 85)
	    print("📊 文言文 vs 白話文 vs English — 跨模型 Token 數完整對比")
	    print("=" * 85)

	    print("\n📏 文本長度:")
	    for label, text in texts.items():
	        print(f" {label}: {_char_count(text)} 字元 / {len(text.encode('utf-8'))} bytes")

	    header = f"\n{'Tokenizer':<32} {'文言文':>7} {'白話文':>7} {'English':>7} │ {'文vs白':>7} {'文vs英':>7} {'白vs英':>7}"
	    print(header)
	    # The rule is as long as the header with its non-ASCII characters dropped.
	    print("─" * len(header.encode("ascii", "ignore").decode()))

	    all_results = {}
	    for name, fn in tok_map.items():
	        c = fn(texts["文言文"])
	        m = fn(texts["白話文"])
	        e = fn(texts["English"])
	        all_results[name] = (c, m, e)
	        # Each column is 1 - first/second: positive means the first text
	        # needs fewer tokens than the second.
	        cm = f"{(1-c/m)*100:+.1f}%"
	        ce = f"{(1-c/e)*100:+.1f}%"
	        me = f"{(1-m/e)*100:+.1f}%"
	        print(f"{name:<32} {c:>7} {m:>7} {e:>7} │ {cm:>7} {ce:>7} {me:>7}")

	    print("\n +值 = 省了 token（前者比後者少）")
	    print(" -值 = 多用 token（前者比後者多）")

	    print("\n📝 GPT 模型 → tokenizer 對應:")
	    print(" cl100k_base → GPT-4, GPT-4 Turbo, GPT-3.5 Turbo")
	    print(" o200k_base → GPT-4o, GPT-4o-mini, o1, o1-mini, o3, o3-mini")

	    return all_results


	def _print_line_comparison(claude_tok):
	    """Print Claude token counts for each non-blank line of the three texts.

	    Lines are paired by position after blank lines are dropped. When the
	    texts have different numbers of lines, the missing cells count as 0
	    tokens and no winner is shown for that row.
	    """
	    from itertools import zip_longest

	    print("\n" + "=" * 85)
	    print("📝 逐句對比 (Claude tokenizer)")
	    print("=" * 85)

	    def non_blank(text):
	        return [line.strip() for line in text.strip().split("\n") if line.strip()]

	    def count(line):
	        # Special tokens are excluded so every row counts text only.
	        return len(claude_tok.encode(line, add_special_tokens=False).ids) if line else 0

	    print(f"\n{'#':<3} {'文言':>6} {'白話':>6} {'英文':>6} │ {'最少':>4} 摘要")
	    print("─" * 75)

	    total = {"文言": 0, "白話": 0, "英文": 0}
	    rows = zip_longest(non_blank(classical), non_blank(modern), non_blank(english), fillvalue="")
	    for i, (c, m, e) in enumerate(rows, start=1):
	        vals = {"文言": count(c), "白話": count(m), "英文": count(e)}
	        for key, value in vals.items():
	            total[key] += value

	        # A winner is only meaningful when all three texts have this line.
	        winner = min(vals, key=vals.get) if all(v > 0 for v in vals.values()) else "—"
	        snippet = c[:28] if c else ""
	        print(f"{i:<3} {vals['文言']:>6} {vals['白話']:>6} {vals['英文']:>6} │ {winner:>4} {snippet}{'…' if len(c)>28 else ''}")

	    print("─" * 75)
	    winner = min(total, key=total.get)
	    print(f"{'Σ':<3} {total['文言']:>6} {total['白話']:>6} {total['英文']:>6} │ {winner:>4}")


	def _print_term_cases(claude_tok):
	    """Print how the Claude tokenizer splits the closing sentence of each text."""
	    print("\n" + "=" * 85)
	    print("🔬 案例分析：技術術語的 token 切分")
	    print("=" * 85)

	    cases = [
	        ("文言", "閱畢，即速寫超文本標記語言、階層樣式表及爪哇腳本，以達吾願、傳汝碼。"),
	        ("白話", "看完之後，就趕快去寫 HTML、CSS 和 JavaScript，來實現目標、交付你的程式碼。"),
	        ("英文", "After reading this, go write HTML, CSS, and JavaScript right away, to achieve our goals and ship your code."),
	    ]

	    for label, text in cases:
	        result = claude_tok.encode(text, add_special_tokens=False)
	        print(f"\n {label} ({len(result.ids)} tokens): {text}")
	        print(f" 切分: {result.tokens}")


	def _print_conclusions(all_results):
	    """Print the numbered conclusions derived from the summary counts.

	    Args:
	        all_results: dict mapping a tokenizer name to
	            ``(classical_tokens, modern_tokens, english_tokens)``.
	    """
	    print("\n" + "=" * 85)
	    print("💡 結論")
	    print("=" * 85)

	    c_chars = _char_count(texts["文言文"])
	    m_chars = _char_count(texts["白話文"])
	    char_saving = (1 - c_chars / m_chars) * 100

	    n = 1

	    # 1. Gap between character count and token count (Claude only).
	    claude_key = next((k for k in all_results if "Claude" in k), None)
	    if claude_key:
	        cc, cm, _ = all_results[claude_key]
	        diff_pct = (cc / cm - 1) * 100
	        if cc == cm:
	            verdict = f"完全一樣（{cc} vs {cm}）"
	        elif abs(diff_pct) < 1:
	            # Close, but not identical: do not claim the counts are the same.
	            verdict = f"幾乎一樣（差距不到 1%，{cc} vs {cm}）"
	        elif diff_pct > 0:
	            verdict = f"文言反而多 {diff_pct:.1f}%（{cc} vs {cm}）"
	        else:
	            verdict = f"文言少 {abs(diff_pct):.1f}%（{cc} vs {cm}）"
	        print(f"\n {n}. 文言文字數少 {char_saving:.0f}%，但 token 數在 Claude 上{verdict}")
	        n += 1

	    # 2. Classical vs modern across tokenizers (negative ratio = classical smaller).
	    cm_ratios = {name: (c / m - 1) * 100 for name, (c, m, e) in all_results.items() if m > 0}
	    classical_wins = {k: v for k, v in cm_ratios.items() if v < -1}  # classical saves >1%
	    modern_wins = {k: v for k, v in cm_ratios.items() if v > 1}  # modern saves >1%
	    ties = {k: v for k, v in cm_ratios.items() if abs(v) <= 1}  # within 1%

	    # "On every tokenizer" is only true when nothing ties or goes the other way.
	    if classical_wins and not modern_wins and not ties:
	        best_name = min(classical_wins, key=classical_wins.get)
	        print(f" {n}. 文言在所有 tokenizer 上都更省 token（最多：{best_name} 省 {abs(classical_wins[best_name]):.1f}%）")
	    elif modern_wins and not classical_wins and not ties:
	        worst_name = max(modern_wins, key=modern_wins.get)
	        print(f" {n}. 白話在所有 tokenizer 上都更省 token（最多：{worst_name} 省 {modern_wins[worst_name]:.1f}%）")
	    elif ties and not classical_wins and not modern_wins:
	        print(f" {n}. 文言與白話在所有 tokenizer 上的差距都在 1% 以內")
	    else:
	        # Mixed outcome: list each group that is present.
	        parts = []
	        if classical_wins:
	            best = min(classical_wins, key=classical_wins.get)
	            parts.append(f"文言較省的有 {len(classical_wins)} 個（如 {best} 省 {abs(classical_wins[best]):.1f}%）")
	        if modern_wins:
	            worst = max(modern_wins, key=modern_wins.get)
	            parts.append(f"白話較省的有 {len(modern_wins)} 個（如 {worst} 省 {modern_wins[worst]:.1f}%）")
	        if ties:
	            parts.append(f"差距 <1% 的有 {len(ties)} 個")
	        print(f" {n}. 跨模型結果不一致：{'；'.join(parts)}")
	    n += 1

	    # 3. How English compares (positive value = English uses fewer tokens).
	    en_vs_c = {name: (1 - e / c) * 100 for name, (c, m, e) in all_results.items() if e > 0 and c > 0}
	    en_vs_m = {name: (1 - e / m) * 100 for name, (c, m, e) in all_results.items() if e > 0 and m > 0}

	    en_beats_c = all(v > 0 for v in en_vs_c.values())
	    en_beats_m = all(v > 0 for v in en_vs_m.values())

	    if en_beats_c and en_beats_m:
	        savings = [*en_vs_c.values(), *en_vs_m.values()]
	        range_min, range_max = int(min(savings)), int(max(savings))
	        print(f" {n}. 英文在所有 tokenizer 上都比中文省（省 {range_min}%~{range_max}%）")
	    elif en_beats_c:
	        range_min, range_max = int(min(en_vs_c.values())), int(max(en_vs_c.values()))
	        print(f" {n}. 英文在所有 tokenizer 上都比文言省（省 {range_min}%~{range_max}%），但個別 tokenizer 上與白話互有勝負")
	    else:
	        print(f" {n}. 英文在多數 tokenizer 上比中文省 token，但非絕對（取決於 tokenizer 對中文的優化程度）")
	    n += 1

	    # 4. Explanation.
	    print(f" {n}. 原因：BPE tokenizer 按訓練語料統計頻率切分")
	    print(" - 語料中出現頻率越高的字詞序列，越容易被合併成單一 token")
	    print(" - 文言生僻字可能被拆成多個 byte-level tokens，抵銷字數優勢")
	    print(" - 對中文優化較深的 tokenizer（如 Qwen），現代白話的效率更高")
	    n += 1

	    # 5. Recommendation.
	    print(f" {n}. 字數少 ≠ token 少。想省 token，關鍵是選用 tokenizer 訓練語料中的高頻詞彙，")
	    print(" 而非追求文體上的簡短")
	    print()


	# Run the comparison only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found