hugocbp · March 13, 2025 22:47 · hugocbp · Mar 13, 2025
diff --git a/encoding_error_investigation.py b/encoding_error_investigation.py
 #!/usr/bin/env python3
 """
 Standalone script to test Unicode apostrophe encoding issues.
 This reproduces the "It's" → "Itâ€™s" encoding issue and tests various encodings.

 Run directly with: python encoding_error_investigation.py
 """


 def check_unicode_apostrophe_encoding():
     """Walk "It’s" through the UTF-8 / Windows-1252 mix-up and back again.

     The string is encoded as UTF-8, those bytes are deliberately decoded as
     Windows-1252, and the result is compared with the mojibake "Itâ€™s".
     The mistake is then reversed (cp1252 encode, UTF-8 decode) to check
     that the original text comes back. Every step is printed to stdout;
     nothing is returned.
     """
     print("\n=== BASIC APOSTROPHE ENCODING TEST ===")

     # The apostrophe is typed literally as U+2019 here; the assertion
     # compares it with the escaped spelling of the same string.
     original_string = "It’s"
     assert original_string == "It\u2019s"

     apostrophe_codepoint = ord(original_string[2])
     print(f"Original string: {original_string}")
     print(f"Unicode codepoint for apostrophe: U+{apostrophe_codepoint:04X}")
     assert apostrophe_codepoint == 0x2019, (
         f"Not using U+2019, got: {hex(apostrophe_codepoint)}"
     )

     # Step 1: the correct UTF-8 byte sequence.
     utf8_bytes = original_string.encode("utf-8")
     byte_hex = list(map(hex, utf8_bytes))
     print(f"UTF-8 bytes: {utf8_bytes}")
     print(f"UTF-8 byte representation: {byte_hex}")

     # Step 2: the mistake -- read those bytes as if they were Windows-1252.
     windows1252_decoded = utf8_bytes.decode("cp1252", errors="replace")
     print(f"Windows-1252 decoded: {windows1252_decoded}")

     if windows1252_decoded != "Itâ€™s":
         print(f"✗ NO MATCH. Expected 'Itâ€™s' but got '{windows1252_decoded}'")
     else:
         print("✓ MATCH! Windows-1252 decoding produces exactly 'Itâ€™s'")

     # Step 3: undo the mistake by reversing both conversions.
     cp1252_bytes = windows1252_decoded.encode("cp1252")
     fixed_string = cp1252_bytes.decode("utf-8", errors="replace")
     print(f"Fixed string: {fixed_string}")

     if fixed_string != original_string:
         print(f"✗ Could not fix back exactly. Got: '{fixed_string}'")
     else:
         print("✓ Successfully fixed back to original string")


 def check_different_apostrophe_types():
     """Contrast the ASCII apostrophe with the curly one under mis-decoding.

     Both spellings of "It's" are encoded as UTF-8. The ASCII form is
     decoded as cp1252; the U+2019 form is decoded as cp1252 and then as
     latin-1, and each result is compared with the mojibake "Itâ€™s".
     Results are printed to stdout; nothing is returned.
     """
     print("\n=== COMPARING DIFFERENT APOSTROPHE TYPES ===")

     # --- U+0027, the plain typewriter apostrophe ---
     plain_text = "It's"
     plain_mark = ord(plain_text[2])
     print(f"ASCII apostrophe: {plain_text}")
     print(f"ASCII apostrophe codepoint: U+{plain_mark:04X}")
     assert plain_mark == 0x0027, "Not ASCII apostrophe"

     plain_bytes = plain_text.encode("utf-8")
     print(f"ASCII UTF-8 bytes: {list(map(hex, plain_bytes))}")

     plain_roundtrip = plain_bytes.decode("cp1252")
     print(f"ASCII apostrophe decoded with cp1252: {plain_roundtrip}")

     # --- U+2019, the right single quotation mark ---
     curly_text = "It\u2019s"
     curly_mark = ord(curly_text[2])
     print(f"\nUnicode apostrophe: {curly_text}")
     print(f"Unicode apostrophe codepoint: U+{curly_mark:04X}")
     assert curly_mark == 0x2019, (
         "Not Unicode Right Single Quotation Mark"
     )

     curly_bytes = curly_text.encode("utf-8")
     print(f"Unicode UTF-8 bytes: {list(map(hex, curly_bytes))}")

     # Reading the multi-byte sequence as cp1252.
     via_cp1252 = curly_bytes.decode("cp1252", errors="replace")
     print(f"Unicode apostrophe decoded with cp1252: {via_cp1252}")
     if via_cp1252 == "Itâ€™s":
         print("✓ MATCH! Windows-1252 decoding produces exactly 'Itâ€™s'")

     # Same bytes read as latin-1, for comparison.
     via_latin1 = curly_bytes.decode("latin-1", errors="replace")
     print(f"Unicode apostrophe decoded with latin-1: {via_latin1}")
     if via_latin1 == "Itâ€™s":
         print("✓ MATCH! Latin-1 decoding produces exactly 'Itâ€™s'")


 def check_mac_specific_encodings():
     """Try a range of codecs to see which mix-ups yield "Itâ€™s".

     The candidate list holds ``macroman`` alongside cp1252, latin-1, ascii
     and three UTF-16 variants. Each codec is exercised in two directions:
     decoding the UTF-8 bytes with it, and encoding the text with it before
     decoding as UTF-8. An ``html.escape`` pass and a double mis-decoding
     through latin-1 follow. All results are printed to stdout; nothing is
     returned.
     """
     print("\n=== TESTING MAC-SPECIFIC ENCODINGS ===")

     # The escape sequence pins the apostrophe to U+2019.
     original_string = "It\u2019s"
     print(f"Testing with original string: {original_string}")
     print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
     assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

     utf8_bytes = original_string.encode("utf-8")
     print(f"UTF-8 bytes: {utf8_bytes}")
     print(f"UTF-8 byte representation: {list(map(hex, utf8_bytes))}")

     candidate_codecs = (
         "cp1252",  # Windows-1252
         "latin-1",  # ISO-8859-1
         "macroman",  # Traditional Mac encoding
         "ascii",  # ASCII with replacement for non-ASCII chars
         "utf-16",  # UTF-16 (common for OS interfaces)
         "utf-16-le",  # UTF-16 Little Endian
         "utf-16-be",  # UTF-16 Big Endian
     )

     print("\nEncoding Test Results:")
     print("-" * 50)

     # Direction 1: correct UTF-8 bytes, wrong codec used to read them.
     print("\nScenario 1: UTF-8 encoded text decoded incorrectly:")
     for codec in candidate_codecs:
         try:
             misread = utf8_bytes.decode(codec, errors="replace")
             print(f"  {codec:<10}: {misread}")
             if misread == "Itâ€™s":
                 print(f"    ✓ MATCH FOUND! {codec} produces exactly 'Itâ€™s'")
         except Exception as exc:
             # The prints sit inside the try, so a failure while writing the
             # result is reported here as well.
             print(f"  {codec:<10}: Error - {exc}")

     # Direction 2: text written with another codec, then read as UTF-8.
     print("\nScenario 2: Different encodings decoded as UTF-8:")
     for codec in candidate_codecs:
         try:
             # errors="replace" substitutes characters the codec cannot encode.
             foreign_bytes = original_string.encode(codec, errors="replace")
             misread = foreign_bytes.decode("utf-8", errors="replace")
             print(f"  {codec:<10}: {misread}")
             if misread == "Itâ€™s":
                 print(f"    ✓ MATCH FOUND! {codec} produces exactly 'Itâ€™s'")
         except Exception as exc:
             print(f"  {codec:<10}: Error - {exc}")

     # HTML escaping of the original text.
     try:
         import html

         print(f"\nHTML encoded: {html.escape(original_string)}")
     except Exception as exc:
         print(f"HTML test: Error - {exc}")

     # UTF-8 bytes read as latin-1, re-encoded as UTF-8, read as latin-1 again.
     try:
         twice_misread = (
             utf8_bytes.decode("latin-1").encode("utf-8").decode("latin-1")
         )
         print(f"\nDouble encoding error: {twice_misread}")
         if twice_misread == "Itâ€™s":
             print("    ✓ MATCH FOUND! Double encoding error produces exactly 'Itâ€™s'")
     except Exception as exc:
         print(f"Double encoding test: Error - {exc}")


 def check_common_web_situations():
     """Simulate web-stack charset mismatches for the U+2019 apostrophe.

     Three families of scenario are printed to stdout: MySQL connection
     charset mix-ups, JSON serialisation of clean and already-garbled text,
     and an HTTP response whose charset header disagrees with its body.
     Nothing is returned.

     MySQL's ``latin1`` character set is Windows-1252, and browsers map the
     ``latin1`` / ``iso-8859-1`` charset labels to Windows-1252 as well, so
     these scenarios use Python's ``cp1252`` codec. Python's ``latin-1``
     codec (strict ISO-8859-1) would turn bytes 0x80 and 0x99 into invisible
     C1 control characters rather than the "€" and "™" those systems show.
     """
     print("\n=== TESTING COMMON WEB SCENARIOS ===")

     # The escape sequence pins the apostrophe to U+2019.
     original_string = "It\u2019s"
     print(f"Original string: {original_string}")
     assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

     # Database connection charset issues (e.g. MySQL with wrong connection charset)
     print("\nDatabase connection scenarios:")

     # UTF-8 bytes read through a latin1 (= Windows-1252) connection.
     utf8_bytes = original_string.encode("utf-8")
     mysql_latin1_connection = utf8_bytes.decode("cp1252", errors="replace")
     print(f"MySQL UTF8 data with latin1 connection: {mysql_latin1_connection}")

     # latin1 (= Windows-1252) bytes read as UTF-8. cp1252 can encode U+2019
     # as the single byte 0x92, which is not valid UTF-8 on its own.
     try:
         latin1_bytes = original_string.encode("cp1252", errors="replace")
         mysql_utf8_misconfig = latin1_bytes.decode("utf-8", errors="replace")
         print(f"MySQL latin1 data treated as UTF8: {mysql_utf8_misconfig}")
     except Exception as e:
         # Best-effort diagnostic: report the failure and carry on.
         print(f"MySQL latin1->UTF8 error: {e}")

     # JSON encoding/decoding with wrong charset
     print("\nJSON encoding scenarios:")
     try:
         import json

         # Clean text: json.dumps escapes the non-ASCII apostrophe by default.
         json_str = json.dumps({"text": original_string})
         print(f"JSON correctly encoded: {json_str}")

         # Text that was already garbled before serialisation.
         broken = utf8_bytes.decode("cp1252", errors="replace")
         json_broken = json.dumps({"text": broken})
         print(f"JSON with already broken text: {json_broken}")
     except Exception as e:
         print(f"JSON test error: {e}")

     # HTTP header encoding issues
     print("\nHTTP header scenarios:")
     # Content-Type: text/html; charset=latin1 but content is UTF-8. The
     # latin1 label is treated as Windows-1252 by browsers.
     latin1_interpreted = utf8_bytes.decode("cp1252", errors="replace")
     print(f"HTTP UTF-8 content with latin1 charset header: {latin1_interpreted}")

     # Both values come from the same decode of the same bytes, so this
     # confirms the two scenarios garble the text identically.
     if latin1_interpreted == mysql_latin1_connection:
         print("✓ MATCH! Latin-1 and MySQL latin1 produce the same result")


 if __name__ == "__main__":
     # Script entry point: run each scenario group in order between a start
     # banner and a finish banner.
     print("Starting encoding tests...")
     for run_check in (
         check_unicode_apostrophe_encoding,
         check_different_apostrophe_types,
         check_mac_specific_encodings,
         check_common_web_situations,
     ):
         run_check()
     print("\nTests completed.")
    print("\nTests completed.")
	#!/usr/bin/env python3
	"""
	Standalone script to test Unicode apostrophe encoding issues.
	This reproduces the "It's" → "Itâ€™s" encoding issue and tests various encodings.

	Run directly with: python encoding_error_investigation.py
	"""


	def check_unicode_apostrophe_encoding():
	    """Test how a wrong decode turns "It’s" into "Itâ€™s".

	    Encodes the string as UTF-8, decodes those bytes as Windows-1252,
	    reports whether the result is exactly "Itâ€™s", then re-encodes as
	    cp1252 and decodes as UTF-8 to check the original is recovered.
	    All output goes to stdout; nothing is returned.
	    """
	    print("\n=== BASIC APOSTROPHE ENCODING TEST ===")

	    # Original string with Unicode right single quotation mark (U+2019),
	    # typed literally; the assert compares it with the escaped spelling.
	    original_string = "It’s"
	    assert original_string == "It\u2019s"

	    print(f"Original string: {original_string}")
	    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
	    assert ord(original_string[2]) == 0x2019, (
	        f"Not using U+2019, got: {hex(ord(original_string[2]))}"
	    )

	    # Step 1: Encode the string as UTF-8
	    utf8_bytes = original_string.encode("utf-8")

	    print(f"UTF-8 bytes: {utf8_bytes}")
	    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

	    # Step 2: Decode the UTF-8 bytes incorrectly as Windows-1252
	    windows1252_decoded = utf8_bytes.decode("cp1252", errors="replace")

	    print(f"Windows-1252 decoded: {windows1252_decoded}")

	    # Check if it matches the expected output
	    if windows1252_decoded == "Itâ€™s":
	        print("✓ MATCH! Windows-1252 decoding produces exactly 'Itâ€™s'")
	    else:
	        print(f"✗ NO MATCH. Expected 'Itâ€™s' but got '{windows1252_decoded}'")

	    # Step 3: Try to fix it by re-encoding with Windows-1252 and decoding with UTF-8
	    fixed_string = windows1252_decoded.encode("cp1252").decode(
	        "utf-8", errors="replace"
	    )
	    print(f"Fixed string: {fixed_string}")

	    if fixed_string == original_string:
	        print("✓ Successfully fixed back to original string")
	    else:
	        print(f"✗ Could not fix back exactly. Got: '{fixed_string}'")


	def check_different_apostrophe_types():
	    """Compare ASCII vs Unicode apostrophes under a wrong decode.

	    Prints the code point and UTF-8 bytes of "It's" written with U+0027
	    and with U+2019, decodes the former as cp1252 and the latter as
	    cp1252 and latin-1, and reports which decode yields exactly "Itâ€™s".
	    All output goes to stdout; nothing is returned.
	    """
	    print("\n=== COMPARING DIFFERENT APOSTROPHE TYPES ===")

	    # Regular ASCII apostrophe
	    ascii_apostrophe = "It's"  # ASCII apostrophe (U+0027)
	    print(f"ASCII apostrophe: {ascii_apostrophe}")
	    print(f"ASCII apostrophe codepoint: U+{ord(ascii_apostrophe[2]):04X}")
	    assert ord(ascii_apostrophe[2]) == 0x0027, "Not ASCII apostrophe"

	    ascii_bytes = ascii_apostrophe.encode("utf-8")
	    print(f"ASCII UTF-8 bytes: {[hex(b) for b in ascii_bytes]}")

	    # Decode with cp1252 - shouldn't change
	    ascii_decoded = ascii_bytes.decode("cp1252")
	    print(f"ASCII apostrophe decoded with cp1252: {ascii_decoded}")

	    # Unicode right single quotation mark
	    unicode_apostrophe = "It\u2019s"  # Unicode right single quote (U+2019)
	    print(f"\nUnicode apostrophe: {unicode_apostrophe}")
	    print(f"Unicode apostrophe codepoint: U+{ord(unicode_apostrophe[2]):04X}")
	    assert ord(unicode_apostrophe[2]) == 0x2019, (
	        "Not Unicode Right Single Quotation Mark"
	    )

	    unicode_bytes = unicode_apostrophe.encode("utf-8")
	    print(f"Unicode UTF-8 bytes: {[hex(b) for b in unicode_bytes]}")

	    # Decode with cp1252 - should produce the issue
	    unicode_decoded = unicode_bytes.decode("cp1252", errors="replace")
	    print(f"Unicode apostrophe decoded with cp1252: {unicode_decoded}")

	    if unicode_decoded == "Itâ€™s":
	        print("✓ MATCH! Windows-1252 decoding produces exactly 'Itâ€™s'")

	    # For comparison, latin-1
	    latin1_decoded = unicode_bytes.decode("latin-1", errors="replace")
	    print(f"Unicode apostrophe decoded with latin-1: {latin1_decoded}")

	    if latin1_decoded == "Itâ€™s":
	        print("✓ MATCH! Latin-1 decoding produces exactly 'Itâ€™s'")


	def check_mac_specific_encodings():
	    """Test a list of encodings that might cause the "Itâ€™s" issue.

	    The list holds ``macroman`` plus cp1252, latin-1, ascii and three
	    UTF-16 variants. Scenario 1 decodes the UTF-8 bytes with each codec;
	    scenario 2 encodes the text with each codec and decodes the result
	    as UTF-8. An ``html.escape`` pass and a double latin-1 mis-decoding
	    follow. All output goes to stdout; nothing is returned.
	    """
	    print("\n=== TESTING MAC-SPECIFIC ENCODINGS ===")

	    # Original string with Unicode right single quotation mark - explicitly using Unicode escape
	    original_string = "It\u2019s"
	    print(f"Testing with original string: {original_string}")
	    print(f"Unicode codepoint for apostrophe: U+{ord(original_string[2]):04X}")
	    assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

	    # The UTF-8 encoded bytes for this string
	    utf8_bytes = original_string.encode("utf-8")
	    print(f"UTF-8 bytes: {utf8_bytes}")
	    print(f"UTF-8 byte representation: {[hex(b) for b in utf8_bytes]}")

	    # Try different encodings that might be used on Mac systems
	    encodings_to_test = [
	        "cp1252",  # Windows-1252
	        "latin-1",  # ISO-8859-1
	        "macroman",  # Traditional Mac encoding
	        "ascii",  # ASCII with replacement for non-ASCII chars
	        "utf-16",  # UTF-16 (common for OS interfaces)
	        "utf-16-le",  # UTF-16 Little Endian
	        "utf-16-be",  # UTF-16 Big Endian
	    ]

	    print("\nEncoding Test Results:")
	    print("-" * 50)

	    # Test scenario 1: UTF-8 encoded text decoded incorrectly with other encodings
	    print("\nScenario 1: UTF-8 encoded text decoded incorrectly:")
	    for encoding in encodings_to_test:
	        try:
	            decoded = utf8_bytes.decode(encoding, errors="replace")
	            print(f"  {encoding:<10}: {decoded}")
	            if decoded == "Itâ€™s":
	                print(f"    ✓ MATCH FOUND! {encoding} produces exactly 'Itâ€™s'")
	        except Exception as e:
	            # Best-effort diagnostic: report the failure and try the next codec.
	            print(f"  {encoding:<10}: Error - {str(e)}")

	    # Test scenario 2: Original string encoded with various encodings then decoded as UTF-8
	    print("\nScenario 2: Different encodings decoded as UTF-8:")
	    for encoding in encodings_to_test:
	        try:
	            # Some encodings might not support the Unicode apostrophe and will replace it
	            encoded = original_string.encode(encoding, errors="replace")
	            # Then decode back as UTF-8 (which might cause issues if UTF-8 is expected)
	            decoded = encoded.decode("utf-8", errors="replace")
	            print(f"  {encoding:<10}: {decoded}")
	            if decoded == "Itâ€™s":
	                print(f"    ✓ MATCH FOUND! {encoding} produces exactly 'Itâ€™s'")
	        except Exception as e:
	            print(f"  {encoding:<10}: Error - {str(e)}")

	    # HTML entities - sometimes browsers display Unicode differently
	    try:
	        import html

	        html_encoded = html.escape(original_string)
	        print(f"\nHTML encoded: {html_encoded}")
	    except Exception as e:
	        print(f"HTML test: Error - {str(e)}")

	    # Test what happens with double encoding/decoding errors
	    try:
	        # Step 1: UTF-8 → latin1 (wrong) → UTF-8 (attempt to fix) → latin1 (wrong again)
	        step1 = utf8_bytes.decode("latin-1")
	        step2 = step1.encode("utf-8")
	        step3 = step2.decode("latin-1")
	        print(f"\nDouble encoding error: {step3}")
	        if step3 == "Itâ€™s":
	            print("    ✓ MATCH FOUND! Double encoding error produces exactly 'Itâ€™s'")
	    except Exception as e:
	        print(f"Double encoding test: Error - {str(e)}")


	def check_common_web_situations():
	    """Test common web/browser encoding scenarios for the U+2019 apostrophe.

	    Prints MySQL connection charset mix-ups, JSON serialisation of clean
	    and already-garbled text, and an HTTP response whose charset header
	    disagrees with its body. Nothing is returned.

	    MySQL's ``latin1`` character set is Windows-1252, and browsers map the
	    ``latin1`` / ``iso-8859-1`` charset labels to Windows-1252 as well, so
	    these scenarios use Python's ``cp1252`` codec. Python's ``latin-1``
	    codec (strict ISO-8859-1) would turn bytes 0x80 and 0x99 into C1
	    control characters rather than the "€" and "™" those systems show.
	    """
	    print("\n=== TESTING COMMON WEB SCENARIOS ===")

	    # Original string with Unicode right single quotation mark
	    original_string = "It\u2019s"
	    print(f"Original string: {original_string}")
	    assert ord(original_string[2]) == 0x2019, "Not using correct Unicode character"

	    # Database connection charset issues (e.g., MySQL with wrong connection charset)
	    print("\nDatabase connection scenarios:")

	    # MySQL with UTF8 data but latin1 (= Windows-1252) connection
	    utf8_bytes = original_string.encode("utf-8")
	    mysql_latin1_connection = utf8_bytes.decode("cp1252", errors="replace")
	    print(f"MySQL UTF8 data with latin1 connection: {mysql_latin1_connection}")

	    # MySQL with latin1 (= Windows-1252) data incorrectly treated as UTF8.
	    # cp1252 can encode U+2019 as byte 0x92, which is not valid UTF-8 alone.
	    try:
	        latin1_bytes = original_string.encode("cp1252", errors="replace")
	        mysql_utf8_misconfig = latin1_bytes.decode("utf-8", errors="replace")
	        print(f"MySQL latin1 data treated as UTF8: {mysql_utf8_misconfig}")
	    except Exception as e:
	        # Best-effort diagnostic: report the failure and carry on.
	        print(f"MySQL latin1->UTF8 error: {str(e)}")

	    # JSON encoding/decoding with wrong charset
	    print("\nJSON encoding scenarios:")
	    try:
	        import json

	        # Correctly encoded JSON
	        json_str = json.dumps({"text": original_string})
	        print(f"JSON correctly encoded: {json_str}")

	        # What if we encode the already broken string?
	        broken = utf8_bytes.decode("cp1252", errors="replace")
	        json_broken = json.dumps({"text": broken})
	        print(f"JSON with already broken text: {json_broken}")
	    except Exception as e:
	        print(f"JSON test error: {str(e)}")

	    # HTTP header encoding issues
	    print("\nHTTP header scenarios:")
	    # Content-Type: text/html; charset=latin1 but content is UTF-8. The
	    # latin1 label is treated as Windows-1252 by browsers.
	    latin1_interpreted = utf8_bytes.decode("cp1252", errors="replace")
	    print(f"HTTP UTF-8 content with latin1 charset header: {latin1_interpreted}")

	    # Both values come from the same decode of the same bytes.
	    if latin1_interpreted == mysql_latin1_connection:
	        print("✓ MATCH! Latin-1 and MySQL latin1 produce the same result")


	if __name__ == "__main__":
	    # Script entry point: run each scenario group in order between a start
	    # banner and a finish banner.
	    print("Starting encoding tests...")
	    check_unicode_apostrophe_encoding()
	    check_different_apostrophe_types()
	    check_mac_specific_encodings()
	    check_common_web_situations()
	    print("\nTests completed.")
No results found