Last active
January 9, 2018 02:40
-
-
Save pnck/d7072b05a3bced13adf0e09044013df0 to your computer and use it in GitHub Desktop.
Chinese encoding problem one shot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#encoding:utf8
# NOTE: only solves problems with text files that have already been encoded into utf8
# NOTE: may lead to a wrong result if the file contains nothing corrupted
import sys
def way1(s):
    """Undo one layer of mojibake: GBK bytes that were decoded as Latin-1.

    The string is mapped back to bytes with Latin-1 (one byte per character)
    and those bytes are then decoded as GBK.

    Raises:
        UnicodeError: if *s* contains a character outside Latin-1, or the
            resulting bytes are not valid GBK.
    """
    return s.encode('latin1').decode('gbk')
def way2(s):
    """Undo two layers of mojibake: UTF-8 bytes read as GBK, then Latin-1/GBK.

    The string is first turned back into bytes with GBK and those bytes are
    decoded as UTF-8, which reverses a UTF-8 payload having been decoded as
    GBK. The intermediate string is then mapped to bytes with Latin-1 and
    decoded as GBK.

    Raises:
        UnicodeError: if any of the encode/decode steps fails for *s*.
    """
    once_fixed = s.encode('gbk').decode('utf8')
    return once_fixed.encode('latin1').decode('gbk')
def _recover(data):
    """Return *data* repaired by the first strategy that succeeds, else ''.

    ``way1`` is tried first and ``way2`` second; a strategy counts as failed
    when it raises ``UnicodeError``.
    """
    for strategy in (way1, way2):
        try:
            return strategy(data)
        except UnicodeError:
            continue
    return ''


def main(argv=None):
    """Try to repair every file named in *argv* (default: ``sys.argv[1:]``).

    Each input is read as UTF-8 text. When a repair strategy yields non-empty
    text, it is written UTF-8 encoded to ``<name>.tryrecover.txt``. Files that
    cannot be decoded or repaired are reported on stderr and skipped, so one
    bad file does not abort the remaining ones.
    """
    paths = sys.argv[1:] if argv is None else argv
    for fname in paths:
        try:
            # Explicit encoding: the locale default (e.g. GBK) would otherwise
            # decide how the UTF-8 input is interpreted.
            with open(fname, encoding='utf-8') as source:
                data = source.read()
        except UnicodeDecodeError as err:
            print('{}: not valid UTF-8, skipped ({})'.format(fname, err),
                  file=sys.stderr)
            continue

        recovered = _recover(data)
        if not recovered:
            print('{}: no recoverable text; no output written'.format(fname),
                  file=sys.stderr)
            continue

        # Binary mode keeps the recovered text's line endings untouched.
        with open(fname + '.tryrecover.txt', 'wb') as target:
            target.write(recovered.encode('utf-8'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment