Skip to content

Instantly share code, notes, and snippets.

@FRex
Created March 17, 2025 22:35
Show Gist options
  • Save FRex/c1f9252cd7b2030fe97f75b38625194e to your computer and use it in GitHub Desktop.
Save FRex/c1f9252cd7b2030fe97f75b38625194e to your computer and use it in GitHub Desktop.
"""Detect codepage that UTF-8 data was misinterpreted as."""
# NOTE: names produced by Tools/unicode/listcodecs.py (CPython source tree).
# Entries are in lexicographic order; detect_miscode() walks the list front
# to back, so this is also the order in which matches are reported.
all_codecs = [
    "ascii", "base64_codec", "big5", "big5hkscs", "bz2_codec", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp273", "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048", "latin_1",
    "mac_arabic", "mac_croatian", "mac_cyrillic", "mac_farsi", "mac_greek",
    "mac_iceland", "mac_latin2", "mac_roman", "mac_romanian", "mac_turkish",
    "mbcs", "oem", "palmos", "ptcp154", "punycode", "quopri_codec",
    "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "undefined", "unicode_escape",
    "utf_16", "utf_16_be", "utf_16_le", "utf_32", "utf_32_be", "utf_32_le",
    "utf_7", "utf_8", "utf_8_sig",
    "uu_codec", "zlib_codec",
]
def detect_miscode(have: str, want: str, candidates=None) -> list:
    """Find all codecs c for which have.encode(c).decode("UTF-8") equals want.

    Args:
        have: The garbled text, i.e. UTF-8 bytes that were decoded with the
            wrong codec.
        want: The text that the bytes were supposed to decode to.
        candidates: Optional iterable of codec names to try. When omitted
            (None), every name in the module-level ``all_codecs`` list is
            tried. An explicitly empty iterable yields an empty result.

    Returns:
        A list of the codec names that reproduce ``want``, in the order they
        were tried.
    """
    if candidates is None:
        candidates = all_codecs
    matches = []
    for codec in candidates:
        try:
            recoded = have.encode(codec).decode("UTF-8")
        except (RuntimeError, UnicodeError, LookupError):
            # UnicodeError: `have` is not representable in this codec, or the
            # resulting bytes are not valid UTF-8.
            # LookupError: the codec is unknown on this platform or is not a
            # text encoding (str.encode() rejects those).
            # RuntimeError is kept from the original handler as a safety net.
            continue
        if recoded == want:
            matches.append(codec)
    return matches
def main():
    """Run a small demo and print the codecs that explain the garbling."""
    want = "👋 Hello, World! ♥"
    # Build the garbled input from `want` instead of hard-coding it: the two
    # literals were identical, which made the demo vacuous. Decoding the
    # UTF-8 bytes as cp1252 reproduces a typical mis-decoding, and computing
    # it keeps the sample safe from tools that "repair" text in source files.
    have = want.encode("UTF-8").decode("cp1252")
    possible = detect_miscode(have, want)
    print(f"Possible codes: {possible}")
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment