Skip to content

Instantly share code, notes, and snippets.

@hugocbp
Created March 13, 2025 22:47
Show Gist options
  • Select an option

  • Save hugocbp/6c8a196b743e57ed9e4a4a11b362c579 to your computer and use it in GitHub Desktop.

Select an option

Save hugocbp/6c8a196b743e57ed9e4a4a11b362c579 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Standalone script to test Unicode apostrophe encoding issues.
This reproduces the "It’s" → "Itâ€™s" mojibake (U+2019 encoded as UTF-8, then misread as Windows-1252) and tests various encodings.
Run directly with: python encoding_check.py
"""
def check_unicode_apostrophe_encoding():
    """Reproduce the apostrophe mojibake and check that it can be reversed.

    The string "It" + U+2019 + "s" is encoded as UTF-8 and the resulting
    bytes are then (wrongly) decoded as Windows-1252. The three UTF-8 bytes
    of U+2019 (E2 80 99) come back as U+00E2, U+20AC and U+2122. The function
    prints each intermediate value, reports whether the decoded text equals
    that expected mojibake, and finally checks that re-encoding as
    Windows-1252 and decoding as UTF-8 restores the original string.

    Returns:
        None. All results are written to standard output.

    Raises:
        AssertionError: if the test string does not contain U+2019.
    """
    print("\n=== BASIC APOSTROPHE ENCODING TEST ===")

    # Both strings are spelled with escapes so the comparison does not depend
    # on how this source file was saved, transcoded or "repaired" on the way.
    original_string = "It\u2019s"
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    print(f"Original string: {original_string}")
    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
    assert ord(original_string[2]) == 0x2019, (
        f"Not using U+2019, got: {hex(ord(original_string[2]))}"
    )

    # Step 1: encode the string as UTF-8.
    utf8_bytes = original_string.encode("utf-8")
    print(f"UTF-8 bytes: {utf8_bytes}")
    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

    # Step 2: decode the UTF-8 bytes incorrectly as Windows-1252.
    windows1252_decoded = utf8_bytes.decode("cp1252", errors="replace")
    print(f"Windows-1252 decoded: {windows1252_decoded}")

    # Compare against the real mojibake, not against the original text.
    if windows1252_decoded == expected_mojibake:
        print(
            "✓ MATCH! Windows-1252 decoding produces exactly "
            f"'{expected_mojibake}'"
        )
    else:
        print(
            f"✗ NO MATCH. Expected '{expected_mojibake}' "
            f"but got '{windows1252_decoded}'"
        )

    # Step 3: undo the damage by reversing the wrong decode.
    fixed_string = windows1252_decoded.encode("cp1252").decode(
        "utf-8", errors="replace"
    )
    print(f"Fixed string: {fixed_string}")
    if fixed_string == original_string:
        print("✓ Successfully fixed back to original string")
    else:
        print(f"✗ Could not fix back exactly. Got: '{fixed_string}'")
def check_different_apostrophe_types():
    """Contrast the ASCII apostrophe with U+2019 under wrong decodings.

    "It's" written with the ASCII apostrophe (U+0027) is encoded as UTF-8 and
    decoded as cp1252 to show that it survives unchanged. The same word
    written with U+2019 is then encoded as UTF-8 and decoded as cp1252 and as
    latin-1; each result is printed and compared with the expected mojibake
    (U+00E2 U+20AC U+2122 in place of the apostrophe).

    Returns:
        None. All results are written to standard output.

    Raises:
        AssertionError: if either test string has an unexpected code point
            at index 2.
    """
    print("\n=== COMPARING DIFFERENT APOSTROPHE TYPES ===")

    # Written with escapes so the comparison cannot be altered by the way
    # this source file itself is stored or transcoded.
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    # Regular ASCII apostrophe (U+0027): a single byte in UTF-8, cp1252 and
    # latin-1 alike, so a wrong decode leaves it untouched.
    ascii_apostrophe = "It's"
    print(f"ASCII apostrophe: {ascii_apostrophe}")
    print(f"ASCII apostrophe codepoint: U+{ord(ascii_apostrophe[2]):04X}")
    assert ord(ascii_apostrophe[2]) == 0x0027, "Not ASCII apostrophe"
    ascii_bytes = ascii_apostrophe.encode("utf-8")
    print(f"ASCII UTF-8 bytes: {[hex(b) for b in ascii_bytes]}")
    ascii_decoded = ascii_bytes.decode("cp1252")
    print(f"ASCII apostrophe decoded with cp1252: {ascii_decoded}")

    # Unicode right single quotation mark (U+2019): three bytes in UTF-8.
    unicode_apostrophe = "It\u2019s"
    print(f"\nUnicode apostrophe: {unicode_apostrophe}")
    print(f"Unicode apostrophe codepoint: U+{ord(unicode_apostrophe[2]):04X}")
    assert ord(unicode_apostrophe[2]) == 0x2019, (
        "Not Unicode Right Single Quotation Mark"
    )
    unicode_bytes = unicode_apostrophe.encode("utf-8")
    print(f"Unicode UTF-8 bytes: {[hex(b) for b in unicode_bytes]}")

    # Decoding those bytes as cp1252 is what produces the reported issue.
    unicode_decoded = unicode_bytes.decode("cp1252", errors="replace")
    print(f"Unicode apostrophe decoded with cp1252: {unicode_decoded}")
    if unicode_decoded == expected_mojibake:
        print(
            "✓ MATCH! Windows-1252 decoding produces exactly "
            f"'{expected_mojibake}'"
        )

    # For comparison, latin-1: bytes 0x80 and 0x99 become the C1 control
    # characters U+0080 and U+0099 there, so this decode is not expected to
    # reproduce the cp1252 mojibake.
    latin1_decoded = unicode_bytes.decode("latin-1", errors="replace")
    print(f"Unicode apostrophe decoded with latin-1: {latin1_decoded}")
    if latin1_decoded == expected_mojibake:
        print(
            f"✓ MATCH! Latin-1 decoding produces exactly '{expected_mojibake}'"
        )
def check_mac_specific_encodings():
    """Try several codecs to see which one reproduces the mojibake.

    Two scenarios are run over the same list of codecs:

    1. The UTF-8 bytes of "It" + U+2019 + "s" are decoded with each codec.
    2. The string is encoded with each codec and the bytes decoded as UTF-8.

    Every result is printed, and a MATCH line is added whenever the result
    equals the expected mojibake (U+00E2 U+20AC U+2122 in place of the
    apostrophe). The HTML-escaped form and a double latin-1 mis-decode are
    printed as well.

    Returns:
        None. All results are written to standard output.

    Raises:
        AssertionError: if the test string does not contain U+2019.
    """
    import html

    print("\n=== TESTING MAC-SPECIFIC ENCODINGS ===")

    # Both strings use escapes so the checks below are independent of how
    # this source file itself is stored or transcoded.
    original_string = "It\u2019s"
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    print(f"Testing with original string: {original_string}")
    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
    assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

    utf8_bytes = original_string.encode("utf-8")
    print(f"UTF-8 bytes: {utf8_bytes}")
    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

    encodings_to_test = [
        "cp1252",  # Windows-1252
        "latin-1",  # ISO-8859-1
        "macroman",  # Traditional Mac encoding
        "ascii",  # ASCII with replacement for non-ASCII chars
        "utf-16",  # UTF-16 (common for OS interfaces)
        "utf-16-le",  # UTF-16 Little Endian
        "utf-16-be",  # UTF-16 Big Endian
    ]

    print("\nEncoding Test Results:")
    print("-" * 50)

    # Scenario 1: UTF-8 encoded text decoded incorrectly with other codecs.
    print("\nScenario 1: UTF-8 encoded text decoded incorrectly:")
    for encoding in encodings_to_test:
        try:
            decoded = utf8_bytes.decode(encoding, errors="replace")
        except (LookupError, UnicodeError) as e:
            # LookupError: codec missing from this Python build.
            print(f" {encoding:<10}: Error - {str(e)}")
            continue
        print(f" {encoding:<10}: {decoded}")
        if decoded == expected_mojibake:
            print(
                f" ✓ MATCH FOUND! {encoding} produces exactly "
                f"'{expected_mojibake}'"
            )

    # Scenario 2: the string encoded with each codec, then decoded as UTF-8.
    print("\nScenario 2: Different encodings decoded as UTF-8:")
    for encoding in encodings_to_test:
        try:
            # Codecs that cannot represent U+2019 substitute "?" for it.
            encoded = original_string.encode(encoding, errors="replace")
            decoded = encoded.decode("utf-8", errors="replace")
        except (LookupError, UnicodeError) as e:
            print(f" {encoding:<10}: Error - {str(e)}")
            continue
        print(f" {encoding:<10}: {decoded}")
        if decoded == expected_mojibake:
            print(
                f" ✓ MATCH FOUND! {encoding} produces exactly "
                f"'{expected_mojibake}'"
            )

    # HTML entities - sometimes browsers display Unicode differently.
    html_encoded = html.escape(original_string)
    print(f"\nHTML encoded: {html_encoded}")

    # Double encoding error:
    # UTF-8 -> latin-1 (wrong) -> UTF-8 (attempted fix) -> latin-1 (wrong again).
    # latin-1 decodes every byte and UTF-8 encodes every resulting character,
    # so none of these steps can raise.
    step1 = utf8_bytes.decode("latin-1")
    step2 = step1.encode("utf-8")
    step3 = step2.decode("latin-1")
    print(f"\nDouble encoding error: {step3}")
    if step3 == expected_mojibake:
        print(
            " ✓ MATCH FOUND! Double encoding error produces exactly "
            f"'{expected_mojibake}'"
        )
def check_common_web_situations():
    """Simulate web-stack charset mismatches for "It" + U+2019 + "s".

    Prints what the string looks like after a MySQL connection-charset
    mismatch (in both directions), after JSON serialisation of the intact and
    of the already-broken text, and after a browser applies a "latin1"
    charset header to UTF-8 content.

    The "latin1" scenarios are simulated with the cp1252 codec rather than
    Python's strict ISO-8859-1 codec: MySQL documents its latin1 charset as
    cp1252, and the WHATWG Encoding Standard maps the latin1 / iso-8859-1
    labels to windows-1252. Strict ISO-8859-1 would turn bytes 0x80 and 0x99
    into invisible C1 controls instead of the visible mojibake.

    Returns:
        None. All results are written to standard output.

    Raises:
        AssertionError: if the test string does not contain U+2019.
    """
    import json

    print("\n=== TESTING COMMON WEB SCENARIOS ===")

    # Both strings use escapes so the checks below are independent of how
    # this source file itself is stored or transcoded.
    original_string = "It\u2019s"
    expected_mojibake = "It\u00e2\u20ac\u2122s"
    # What MySQL and browsers actually mean by "latin1" (see docstring).
    web_latin1 = "cp1252"

    print(f"Original string: {original_string}")
    assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

    # Database connection charset issues (e.g. MySQL with the wrong
    # connection charset).
    print("\nDatabase connection scenarios:")

    # UTF-8 data read through a latin1 connection.
    utf8_bytes = original_string.encode("utf-8")
    mysql_latin1_connection = utf8_bytes.decode(web_latin1, errors="replace")
    print(f"MySQL UTF8 data with latin1 connection: {mysql_latin1_connection}")
    if mysql_latin1_connection == expected_mojibake:
        print(
            "✓ MATCH! MySQL latin1 connection produces exactly "
            f"'{expected_mojibake}'"
        )

    # latin1 data incorrectly treated as UTF-8: U+2019 is the single byte
    # 0x92 in cp1252, which is not valid UTF-8 and becomes U+FFFD.
    latin1_bytes = original_string.encode(web_latin1, errors="replace")
    mysql_utf8_misconfig = latin1_bytes.decode("utf-8", errors="replace")
    print(f"MySQL latin1 data treated as UTF8: {mysql_utf8_misconfig}")

    # JSON encoding/decoding with the wrong charset.
    print("\nJSON encoding scenarios:")
    json_str = json.dumps({"text": original_string})
    print(f"JSON correctly encoded: {json_str}")
    # What if the already-broken string gets serialised?
    broken = utf8_bytes.decode("cp1252", errors="replace")
    json_broken = json.dumps({"text": broken})
    print(f"JSON with already broken text: {json_broken}")

    # HTTP header issue: "Content-Type: text/html; charset=latin1" while the
    # body is really UTF-8.
    print("\nHTTP header scenarios:")
    latin1_interpreted = utf8_bytes.decode(web_latin1, errors="replace")
    print(f"HTTP UTF-8 content with latin1 charset header: {latin1_interpreted}")
    if latin1_interpreted == expected_mojibake:
        print(
            "✓ MATCH! latin1 charset header produces exactly "
            f"'{expected_mojibake}'"
        )
    if latin1_interpreted == mysql_latin1_connection:
        print("✓ MATCH! Latin-1 and MySQL latin1 produce the same result")
if __name__ == "__main__":
    # Script entry point: run every check in order, framed by a banner.
    print("Starting encoding tests...")
    for run_check in (
        check_unicode_apostrophe_encoding,
        check_different_apostrophe_types,
        check_mac_specific_encodings,
        check_common_web_situations,
    ):
        run_check()
    print("\nTests completed.")
@hugocbp
Copy link
Author

hugocbp commented Mar 13, 2025

Ad hoc Python script to try to figure out which encoding a client who reported the error was using to see the exact output sent in the screenshot.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment