# Created June 21, 2016 22:56
# Save jhyland87/5b4a3b1bbca473ad09a8da9a5bdadd54 to your computer and use it in GitHub Desktop.
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
import fileinput
import sys

# Python 2 only. Switching the interpreter-wide default codec from ASCII to
# UTF-8 is what lets later code call .encode() directly on the byte string
# returned by raw_input(): Python 2 first decodes the bytes implicitly with
# the default codec. NOTE(review): this is a process-wide hack that can hide
# encoding bugs in other modules; prefer explicit .decode('utf-8') calls.
reload(sys)
sys.setdefaultencoding("utf-8")

# Read the text to clean up; on Python 2 this is a byte string in whatever
# encoding the terminal uses.
name = raw_input("Enter your name: ")  # Python 2.x

# Earlier experiments, kept for reference:
#ascii = name.decode('unicode_escape').encode('ascii','ignore')
#utf8 = name.decode('unicode_escape').encode('utf8','ignore')
# Characters (keyed by Unicode code point) and the plain-ASCII text that
# stands in for them. U+0082-U+0099 are the C1 control positions that show up
# when Windows-1252 punctuation has been decoded as Latin-1.
_ASCII_REPLACEMENTS = {
    u'\x82': u',',      # High code comma
    u'\x84': u',,',     # High code double comma
    u'\x85': u'...',    # Triple dot
    u'\x88': u'^',      # High caret
    u'\x91': u'\'',     # Forward single quote
    u'\x92': u'\'',     # Reverse single quote
    u'\x93': u'"',      # Forward double quote
    u'\x94': u'"',      # Reverse double quote
    u'\x95': u' ',      # Bullet position in Windows-1252
    u'\x96': u'-',      # High hyphen
    u'\x97': u'--',     # Double hyphen
    u'\x99': u' ',      # Trade mark position in Windows-1252
    u'\xa0': u' ',      # No-break space
    u'\xa6': u'|',      # Split vertical bar
    u'\xab': u'<<',     # Double less than
    u'\xbb': u'>>',     # Double greater than
    u'\xbc': u'1/4',    # one quarter
    u'\xbd': u'1/2',    # one half
    u'\xbe': u'3/4',    # three quarters
    u'\u02bf': u'\x27',  # c-single quote
    u'\u0328': u'',     # modifier - under curve
    u'\u0331': u'',     # modifier - under line
}

# The same table expressed as UTF-8 byte sequences, for byte-string input.
_ASCII_REPLACEMENTS_UTF8 = dict(
    (key.encode('utf-8'), value.encode('utf-8'))
    for key, value in _ASCII_REPLACEMENTS.items()
)


def replace_chars(string):
    """Replace selected non-ASCII characters with ASCII look-alikes.

    Args:
        string: Either a UTF-8 encoded byte string (``str`` on Python 2,
            ``bytes`` on Python 3) or a unicode text string.

    Returns:
        A string of the same type as the input in which every character
        listed in the replacement table has been substituted. Characters
        that are not in the table are left untouched.
    """
    # Byte input is matched against UTF-8 byte sequences and text input
    # against code points, so the types passed to .replace() never mix.
    if isinstance(string, bytes):
        table = _ASCII_REPLACEMENTS_UTF8
    else:
        table = _ASCII_REPLACEMENTS

    # Every replacement is pure ASCII and every key is non-ASCII, so one
    # substitution can never create input for another; order is irrelevant.
    for needle, substitute in table.items():
        string = string.replace(needle, substitute)
    return string
import re

# Matches one well-formed multi-byte UTF-8 sequence as it looks after the
# bytes were wrongly decoded as Latin-1 (each byte became one character).
# Overlong forms, surrogates and values above U+10FFFF are excluded so that
# re-decoding a match as UTF-8 can never fail.
POSSIBLE_UTF8_SEQUENCE = re.compile(
    u'('
    u'[\xc2-\xdf][\x80-\xbf]'                  # 2-byte sequence
    u'|\xe0[\xa0-\xbf][\x80-\xbf]'             # 3-byte, no overlongs
    u'|[\xe1-\xec\xee\xef][\x80-\xbf]{2}'      # 3-byte, general
    u'|\xed[\x80-\x9f][\x80-\xbf]'             # 3-byte, no surrogates
    u'|\xf0[\x90-\xbf][\x80-\xbf]{2}'          # 4-byte, no overlongs
    u'|[\xf1-\xf3][\x80-\xbf]{3}'              # 4-byte, general
    u'|\xf4[\x80-\x8f][\x80-\xbf]{2}'          # 4-byte, up to U+10FFFF
    u')'
)


def replace_chars1(text):
    """Repair UTF-8 text that was mistakenly decoded as Latin-1.

    Each run of characters that looks like the Latin-1 rendering of a UTF-8
    byte sequence is converted back to bytes and decoded as UTF-8. The search
    restarts after every fix, so text that was mis-decoded more than once is
    unwound one layer at a time.

    Args:
        text: A unicode text string.

    Returns:
        The text with every such sequence replaced by the character it
        encodes. Text without any such sequence is returned unchanged.
    """
    while True:
        match = POSSIBLE_UTF8_SEQUENCE.search(text)
        if not match:
            return text
        # Latin-1 maps code points 0-255 straight back to the same byte
        # values, which recovers the original UTF-8 bytes.
        fixed = match.group(1).encode('latin-1').decode('utf-8')
        # A match is at least two characters and decodes to exactly one, so
        # the text shrinks on every pass and the loop terminates.
        text = text[:match.start()] + fixed + text[match.end():]
# Maps each C1 control code point (U+0080-U+009F) to the character that the
# same byte value represents in Windows-1252. Positions 0x81, 0x8D, 0x8F,
# 0x90 and 0x9D are unassigned in Windows-1252 and therefore have no entry.
cp1252 = {
    # from http://www.microsoft.com/typography/unicode/1252.htm
    u"\x80": u"\u20AC",  # EURO SIGN
    u"\x82": u"\u201A",  # SINGLE LOW-9 QUOTATION MARK
    u"\x83": u"\u0192",  # LATIN SMALL LETTER F WITH HOOK
    u"\x84": u"\u201E",  # DOUBLE LOW-9 QUOTATION MARK
    u"\x85": u"\u2026",  # HORIZONTAL ELLIPSIS
    u"\x86": u"\u2020",  # DAGGER
    u"\x87": u"\u2021",  # DOUBLE DAGGER
    u"\x88": u"\u02C6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    u"\x89": u"\u2030",  # PER MILLE SIGN
    u"\x8A": u"\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    u"\x8B": u"\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    u"\x8C": u"\u0152",  # LATIN CAPITAL LIGATURE OE
    u"\x8E": u"\u017D",  # LATIN CAPITAL LETTER Z WITH CARON
    u"\x91": u"\u2018",  # LEFT SINGLE QUOTATION MARK
    u"\x92": u"\u2019",  # RIGHT SINGLE QUOTATION MARK
    u"\x93": u"\u201C",  # LEFT DOUBLE QUOTATION MARK
    u"\x94": u"\u201D",  # RIGHT DOUBLE QUOTATION MARK
    u"\x95": u"\u2022",  # BULLET
    u"\x96": u"\u2013",  # EN DASH
    u"\x97": u"\u2014",  # EM DASH
    u"\x98": u"\u02DC",  # SMALL TILDE
    u"\x99": u"\u2122",  # TRADE MARK SIGN
    u"\x9A": u"\u0161",  # LATIN SMALL LETTER S WITH CARON
    u"\x9B": u"\u203A",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    u"\x9C": u"\u0153",  # LATIN SMALL LIGATURE OE
    u"\x9E": u"\u017E",  # LATIN SMALL LETTER Z WITH CARON
    u"\x9F": u"\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
import re


def kill_gremlins(text):
    """Map stray Windows-1252 "gremlins" to the characters they stand for.

    Code points U+0080-U+009F are looked up in the module-level ``cp1252``
    table and replaced by the corresponding real Unicode character. Code
    points in that range that have no table entry are left as they are.

    Args:
        text: A unicode text string, or a byte string, which is interpreted
            as ISO-8859-1 when it contains bytes in the 0x80-0x9F range.

    Returns:
        Unicode text with the gremlins replaced. A byte string that contains
        no bytes in the 0x80-0x9F range is returned unchanged, still as a
        byte string.
    """
    if isinstance(text, bytes):
        # Nothing to fix: hand the caller's byte string back untouched
        # rather than decoding it needlessly.
        if not re.search(b"[\x80-\x9f]", text):
            return text
        # make sure we have a unicode string
        text = text.decode("iso-8859-1")

    def fixup(match):
        # Fall back to the original character for unassigned positions.
        char = match.group(0)
        return cp1252.get(char, char)

    # re.sub returns the text unchanged when nothing matches, so no separate
    # search is needed first.
    return re.sub(u"[\x80-\x9f]", fixup, text)
# Compare four ways of reducing the entered text to ASCII.

# A: interpret backslash escapes in the input, then turn anything that is
#    still non-ASCII into '?'.
a = name.decode('unicode_escape').encode('ascii', 'replace')

# B: treat the input as UTF-8 and turn every non-ASCII character into '?'.
b = name.decode('utf-8').encode('ascii', 'replace')

# C: encode the byte string directly. Python 2 implicitly decodes it with the
#    interpreter's default codec first, so the outcome depends on that codec.
c = name.encode("ascii", "replace")

# D: substitute ASCII look-alikes from the replace_chars() table.
d = replace_chars(name)

print('Original: %s\nA: %s\nB: %s\nC: %s\nD: %s' % (name, a, b, c, d))
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.