Created
March 13, 2025 22:47
-
-
Save hugocbp/6c8a196b743e57ed9e4a4a11b362c579 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Standalone script to test Unicode apostrophe encoding issues. | |
| This reproduces the "It’s" → "Itâ€™s" mojibake (U+2019 encoded as UTF-8, then misread as Windows-1252) and tests various encodings. | |
| Run directly with: python encoding_check.py | |
| """ | |
def check_unicode_apostrophe_encoding() -> None:
    """Reproduce the U+2019 mojibake and verify that it can be reversed.

    Encodes the string "It" + U+2019 + "s" as UTF-8, decodes those bytes as
    Windows-1252 (the mistake under investigation) and compares the result
    with the expected garbled text "It" + U+00E2 U+20AC U+2122 + "s".  It
    then undoes the mistake (encode as cp1252, decode as UTF-8) and checks
    that the round trip restores the original string.

    Prints a report to stdout and returns ``None``.

    Raises:
        AssertionError: if the test string does not hold U+2019 at index 2.
    """
    print("\n=== BASIC APOSTROPHE ENCODING TEST ===")

    # Both literals are spelled with escapes so that an editor, a copy/paste
    # or a re-encoding of this file cannot silently change them.
    original_string = "It\u2019s"  # RIGHT SINGLE QUOTATION MARK
    # UTF-8 for U+2019 is E2 80 99; cp1252 maps those three bytes to
    # U+00E2 (a with circumflex), U+20AC (euro sign), U+2122 (trade mark).
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    print(f"Original string: {original_string}")
    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
    assert ord(original_string[2]) == 0x2019, (
        f"Not using U+2019, got: {hex(ord(original_string[2]))}"
    )

    # Step 1: Encode the string as UTF-8
    utf8_bytes = original_string.encode("utf-8")
    print(f"UTF-8 bytes: {utf8_bytes}")
    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

    # Step 2: Decode the UTF-8 bytes incorrectly as Windows-1252
    windows1252_decoded = utf8_bytes.decode("cp1252", errors="replace")
    print(f"Windows-1252 decoded: {windows1252_decoded}")

    # Check if it matches the garbled text that was reported
    if windows1252_decoded == expected_mojibake:
        print(
            "✓ MATCH! Windows-1252 decoding produces exactly "
            f"'{expected_mojibake}'"
        )
    else:
        print(
            f"✗ NO MATCH. Expected '{expected_mojibake}' "
            f"but got '{windows1252_decoded}'"
        )

    # Step 3: Undo the mistake: re-encode as Windows-1252, decode as UTF-8
    fixed_string = windows1252_decoded.encode("cp1252").decode(
        "utf-8", errors="replace"
    )
    print(f"Fixed string: {fixed_string}")
    if fixed_string == original_string:
        print("✓ Successfully fixed back to original string")
    else:
        print(f"✗ Could not fix back exactly. Got: '{fixed_string}'")
def check_different_apostrophe_types() -> None:
    """Contrast the ASCII apostrophe with U+2019 when UTF-8 is mis-decoded.

    For "It's" (U+0027) and for "It" + U+2019 + "s", prints the code point,
    the UTF-8 bytes, and the text obtained by decoding those bytes as cp1252.
    The U+2019 bytes are additionally decoded as Latin-1 for comparison.
    A match line is printed when a decoding equals the expected garbled text
    "It" + U+00E2 U+20AC U+2122 + "s".

    Prints a report to stdout and returns ``None``.

    Raises:
        AssertionError: if either test string holds an unexpected character
            at index 2.
    """
    print("\n=== COMPARING DIFFERENT APOSTROPHE TYPES ===")

    # Written with escapes so the garbled text cannot be "repaired" (and the
    # comparison broken) by a tool that re-encodes this file.
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    # Regular ASCII apostrophe
    ascii_apostrophe = "It's"  # ASCII apostrophe (U+0027)
    print(f"ASCII apostrophe: {ascii_apostrophe}")
    print(f"ASCII apostrophe codepoint: U+{ord(ascii_apostrophe[2]):04X}")
    assert ord(ascii_apostrophe[2]) == 0x0027, "Not ASCII apostrophe"
    ascii_bytes = ascii_apostrophe.encode("utf-8")
    print(f"ASCII UTF-8 bytes: {[hex(b) for b in ascii_bytes]}")
    # Pure ASCII bytes mean the same thing in cp1252, so nothing changes.
    ascii_decoded = ascii_bytes.decode("cp1252")
    print(f"ASCII apostrophe decoded with cp1252: {ascii_decoded}")

    # Unicode right single quotation mark
    unicode_apostrophe = "It\u2019s"  # Unicode right single quote (U+2019)
    print(f"\nUnicode apostrophe: {unicode_apostrophe}")
    print(f"Unicode apostrophe codepoint: U+{ord(unicode_apostrophe[2]):04X}")
    assert ord(unicode_apostrophe[2]) == 0x2019, (
        "Not Unicode Right Single Quotation Mark"
    )
    unicode_bytes = unicode_apostrophe.encode("utf-8")
    print(f"Unicode UTF-8 bytes: {[hex(b) for b in unicode_bytes]}")

    # Decode with cp1252 - should produce the issue
    unicode_decoded = unicode_bytes.decode("cp1252", errors="replace")
    print(f"Unicode apostrophe decoded with cp1252: {unicode_decoded}")
    if unicode_decoded == expected_mojibake:
        print(
            "✓ MATCH! Windows-1252 decoding produces exactly "
            f"'{expected_mojibake}'"
        )

    # For comparison, latin-1.  Latin-1 maps bytes 0x80 and 0x99 to the C1
    # control characters U+0080 / U+0099 rather than to the euro and trade
    # mark signs, so this is not expected to match.
    latin1_decoded = unicode_bytes.decode("latin-1", errors="replace")
    print(f"Unicode apostrophe decoded with latin-1: {latin1_decoded}")
    if latin1_decoded == expected_mojibake:
        print(
            f"✓ MATCH! Latin-1 decoding produces exactly '{expected_mojibake}'"
        )
def check_mac_specific_encodings() -> None:
    """Try a list of candidate encodings to find which one yields the mojibake.

    Works on the string "It" + U+2019 + "s" and reports, for each encoding in
    the candidate list:

    * Scenario 1 - the text obtained by decoding the string's UTF-8 bytes
      with that encoding;
    * Scenario 2 - the text obtained by encoding the string with that
      encoding and decoding the result as UTF-8.

    It also prints the ``html.escape`` form of the string and the result of a
    double UTF-8 / Latin-1 mix-up.  Whenever a result equals the expected
    garbled text "It" + U+00E2 U+20AC U+2122 + "s", a match line is printed.

    Prints a report to stdout and returns ``None``.

    Raises:
        AssertionError: if the test string does not hold U+2019 at index 2.
    """
    import html

    print("\n=== TESTING MAC-SPECIFIC ENCODINGS ===")

    # Both literals use escapes so that re-encoding this file cannot change
    # them (the expected garbled text in particular is easy to "repair" by
    # accident, which makes every comparison below fail).
    original_string = "It\u2019s"
    expected_mojibake = "It\u00e2\u20ac\u2122s"

    print(f"Testing with original string: {original_string}")
    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
    assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

    # The UTF-8 encoded bytes for this string
    utf8_bytes = original_string.encode("utf-8")
    print(f"UTF-8 bytes: {utf8_bytes}")
    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

    # Try different encodings that might be used on Mac systems
    encodings_to_test = [
        "cp1252",  # Windows-1252
        "latin-1",  # ISO-8859-1
        "macroman",  # Traditional Mac encoding
        "ascii",  # ASCII with replacement for non-ASCII chars
        "utf-16",  # UTF-16 (common for OS interfaces)
        "utf-16-le",  # UTF-16 Little Endian
        "utf-16-be",  # UTF-16 Big Endian
    ]

    print("\nEncoding Test Results:")
    print("-" * 50)

    # Test scenario 1: UTF-8 encoded text decoded incorrectly with other encodings
    print("\nScenario 1: UTF-8 encoded text decoded incorrectly:")
    for encoding in encodings_to_test:
        try:
            decoded = utf8_bytes.decode(encoding, errors="replace")
        except (LookupError, UnicodeError) as exc:
            # LookupError: codec unavailable; UnicodeError: codec rejected
            # the data despite errors="replace".
            print(f" {encoding:<10}: Error - {exc}")
            continue
        print(f" {encoding:<10}: {decoded}")
        if decoded == expected_mojibake:
            print(
                f" ✓ MATCH FOUND! {encoding} produces exactly "
                f"'{expected_mojibake}'"
            )

    # Test scenario 2: Original string encoded with various encodings then decoded as UTF-8
    print("\nScenario 2: Different encodings decoded as UTF-8:")
    for encoding in encodings_to_test:
        try:
            # Encodings that cannot represent U+2019 substitute "?" for it.
            encoded = original_string.encode(encoding, errors="replace")
        except (LookupError, UnicodeError) as exc:
            print(f" {encoding:<10}: Error - {exc}")
            continue
        # Then read those bytes back as if they were UTF-8.
        decoded = encoded.decode("utf-8", errors="replace")
        print(f" {encoding:<10}: {decoded}")
        if decoded == expected_mojibake:
            print(
                f" ✓ MATCH FOUND! {encoding} produces exactly "
                f"'{expected_mojibake}'"
            )

    # HTML entities - sometimes browsers display Unicode differently.
    # html.escape() on a str cannot fail, so no exception handling is needed.
    html_encoded = html.escape(original_string)
    print(f"\nHTML encoded: {html_encoded}")

    # Double mix-up: UTF-8 -> latin1 (wrong) -> UTF-8 (attempt to fix)
    # -> latin1 (wrong again).  Latin-1 decodes every byte value and UTF-8
    # encodes every code point produced here, so none of these can raise.
    step1 = utf8_bytes.decode("latin-1")
    step2 = step1.encode("utf-8")
    step3 = step2.decode("latin-1")
    print(f"\nDouble encoding error: {step3}")
    if step3 == expected_mojibake:
        print(
            " ✓ MATCH FOUND! Double encoding error produces exactly "
            f"'{expected_mojibake}'"
        )
def check_common_web_situations():
    """Print how "It" + U+2019 + "s" appears under common charset mismatches.

    Three families of mismatch are shown: a database connection whose
    charset disagrees with the stored data, JSON serialisation of the
    correct and of the already-garbled text, and an HTTP response whose
    declared charset is latin1 while the body is UTF-8.  Output goes to
    stdout; nothing is returned.
    """
    print("\n=== TESTING COMMON WEB SCENARIOS ===")

    # Right single quotation mark, spelled as an escape on purpose.
    text = "It\u2019s"
    print(f"Original string: {text}")
    apostrophe_codepoint = ord(text[2])
    assert apostrophe_codepoint == 0x2019, "Not using correct Unicode character"

    # --- Database connection charset mismatches (MySQL-style) ---
    print("\nDatabase connection scenarios:")

    # UTF-8 data read through a latin1 connection.
    encoded_utf8 = text.encode("utf-8")
    seen_via_latin1_connection = encoded_utf8.decode("latin-1")
    print(f"MySQL UTF8 data with latin1 connection: {seen_via_latin1_connection}")

    # latin1 data that is wrongly interpreted as UTF-8.
    try:
        stored_as_latin1 = text.encode("latin-1", errors="replace")
        read_back_as_utf8 = stored_as_latin1.decode("utf-8", errors="replace")
        print(f"MySQL latin1 data treated as UTF8: {read_back_as_utf8}")
    except Exception as exc:
        print(f"MySQL latin1->UTF8 error: {str(exc)}")

    # --- JSON serialisation of good and of already-broken text ---
    print("\nJSON encoding scenarios:")
    try:
        import json

        good_payload = {"text": text}
        good_json = json.dumps(good_payload)
        print(f"JSON correctly encoded: {good_json}")

        # Serialise text that was garbled before it ever reached JSON.
        garbled_text = encoded_utf8.decode("cp1252", errors="replace")
        garbled_payload = {"text": garbled_text}
        garbled_json = json.dumps(garbled_payload)
        print(f"JSON with already broken text: {garbled_json}")
    except Exception as exc:
        print(f"JSON test error: {str(exc)}")

    # --- HTTP: header declares charset=latin1 but the body is UTF-8 ---
    print("\nHTTP header scenarios:")
    seen_via_http_header = encoded_utf8.decode("latin-1")
    print(f"HTTP UTF-8 content with latin1 charset header: {seen_via_http_header}")
    if seen_via_http_header == seen_via_latin1_connection:
        print("✓ MATCH! Latin-1 and MySQL latin1 produce the same result")
if __name__ == "__main__":
    # Script entry point: run each diagnostic once, in a fixed order,
    # between a start banner and a completion banner.
    print("Starting encoding tests...")
    for run_check in (
        check_unicode_apostrophe_encoding,
        check_different_apostrophe_types,
        check_mac_specific_encodings,
        check_common_web_situations,
    ):
        run_check()
    print("\nTests completed.")
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ad hoc Python script for working out which encoding a client was using when they reported the error, by reproducing the exact garbled output shown in their screenshot.