Created
May 22, 2020 19:26
-
-
Save zhehaowang/42503c27ba43bc68f0408fd64a596312 to your computer and use it in GitHub Desktop.
utf-8 recovery hack
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import glob | |
import csv | |
from xlsxwriter.workbook import Workbook | |
# Convert each listed CSV file into an .xlsx workbook with the same base name.
# Every CSV cell is copied as-is (as a string) into the matching row/column
# of a single worksheet.
for csvfile in ["out.csv"]:
    # Strip the 4-character ".csv" suffix and swap in ".xlsx".
    xlsx_name = csvfile[:-4] + '.xlsx'
    workbook = Workbook(xlsx_name)
    worksheet = workbook.add_worksheet()

    # newline='' is what the csv module expects so that it can handle
    # embedded line breaks inside quoted fields itself.
    with open(csvfile, 'r', newline='', encoding='utf-8') as src:
        for row_idx, record in enumerate(csv.reader(src)):
            for col_idx, cell in enumerate(record):
                worksheet.write(row_idx, col_idx, cell)

    # The workbook is only finalised when it is closed.
    workbook.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def should_flip(ints):
    """Decide whether a short byte window should get 128 added to each byte.

    ``ints`` is a sequence of integer byte values (e.g. a ``bytes`` slice).
    Returns ``True`` when the window looks like a multi-byte sequence whose
    bytes should be shifted up by 128 by the caller, ``False`` otherwise.

    Two heuristics are applied, in order:

    1. A 3-character window made of one of ``e d f g h`` followed by two
       characters that are each a digit, a comma or a newline is always
       flipped (examples seen in the data: "e94", "d8\\n", "f57", "g,,").
    2. Otherwise, a window consisting only of alphanumerics, newlines,
       commas and underscores is never flipped. Anything else is flipped
       when its first byte has the expected lead-byte bit pattern for its
       length (3 or 4 bytes) and none of the remaining bytes has bit 6 set.

    Windows of any other length (including 2-byte windows, which are
    deliberately not handled) are never flipped by the second heuristic.
    """
    text = "".join(map(chr, ints))

    # Heuristic 1: plain-looking triples that are known to be damaged text.
    if len(text) == 3 and re.match(r"^[edfgh][,\n0-9][,\n0-9]$", text):
        return True

    # Heuristic 2, guard: ordinary CSV content is left alone.
    harmless = ['\n', ',', '_']
    if all(chr(b).isalnum() or chr(b) in harmless for b in ints):
        return False

    # Every byte after the first must have bit 6 clear.
    tail_ok = all((0b01000000 & b) == 0 for b in ints[1:])

    # NOTE(review): the original author observed that the Chinese text in
    # this data set appears to always be 3 bytes long - confirm before
    # relying on the 4-byte branch.
    size = len(ints)
    if size == 4:
        return (0b11111000 & ints[0] == 0b01110000) and tail_ok
    if size == 3:
        return (0b11110000 & ints[0] == 0b01100000) and tail_ok
    return False
# Repair pass: read the damaged CSV as raw bytes and write a fixed copy to
# out.csv. The input is scanned left to right with a 3-byte window; whenever
# should_flip() accepts the window, bit 7 (0x80) is set on each of its bytes
# and the scan jumps past it. Otherwise the current byte is copied unchanged
# and the scan advances by one.
# NOTE(review): presumably the input lost the top bit of every byte of its
# multi-byte UTF-8 characters - confirm against the notes further down.
with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f, open('out.csv', 'wb') as outfile:
    content = f.read()
    window = 3  # only 3-byte sequences are repaired
    repaired = bytearray()
    i = 0
    while i < len(content):
        seg = content[i:i + window]
        if should_flip(seg):
            # OR-ing in the high bit equals adding 128 for bytes below 128,
            # but cannot overflow a single byte when should_flip() lets
            # through a byte that already has the bit set.
            repaired.extend(b | 0x80 for b in seg)
            i += window
        else:
            repaired.append(content[i])
            i += 1
    # Emit everything in one write instead of one write per byte.
    outfile.write(repaired)
# c = chr(content[i]) | |
# num = content[i] | |
# if c.isalnum() or c in ['\n', ',', '_']: | |
# pass | |
# else: | |
# num += 128 | |
# outfile.write(num.to_bytes(1, byteorder='little')) | |
# i += 1 | |
# with open('/Users/zwang/Downloads/Shanghai_adminLand_200sample_for_Weiping.csv', 'rb') as f: | |
# # reader = csv.reader(f) | |
# content = f.read() | |
# # for c in content: | |
# # print(c), | |
# utf8str = content.decode("utf-8") | |
# line = utf8str.split('\n')[1:5] | |
# print("".join(line).encode("utf-8").hex()) | |
# # part = line.split(',')[1] | |
# # partbin = part.encode("utf-8") | |
# # print(partbin.hex()) | |
# # print(u"f\x17%".encode("utf-8")) | |
# # print(u"f\x17%".encode("utf-8").hex()) | |
# # print(u"f\x17".encode("utf-8").decode("")) | |
# # print(u"年 月 日".encode("gb2312")) | |
# # print(u"年 月 日".encode("utf-8")) | |
# # 2016 e94 9 f\x1c\x08 13 f\x17% | |
# actual unicode: \xe5\xb9\xb4 \xe6\x9c\x88 \xe6\x97\xa5 | |
# given: \x65\x3e\x34 \x66\x1c\x08 \x66\x17\x25 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment