Created
July 22, 2016 17:31
-
-
Save jhyland87/2bac8523407dbf9c922bcc7cb83fa954 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fileinput | |
import sys | |
import string | |
reload(sys) | |
sys.setdefaultencoding("utf-8") | |
def sanitizeData( data ):
    """Replace typographic ("smart") punctuation in ``data`` with ASCII.

    Lists and dicts are walked recursively and updated *in place*; the same
    container object is returned.  Any other value is converted with
    ``str()`` and sanitized as a string, so non-string leaves (numbers,
    ``None``, tuples, ...) come back as strings.

    For every string:

    * ASCII punctuation already present in the input is stripped;
    * curly quotes, dashes, the small tilde, the bullet and the no-break
      space are mapped to an ASCII look-alike;
    * if any character from the "unsupported" set is present, the message
      ``BAD STRING FOUND`` is printed and the interpreter is asked to exit.

    Returns:
        The sanitized value as the interpreter's native ``str`` type (a
        UTF-8 encoded byte string on Python 2, text on Python 3), or the
        original list/dict object with its contents sanitized.

    Raises:
        SystemExit: when a string contains an unsupported character.
        UnicodeDecodeError: when a byte string is not valid UTF-8.
    """
    # Typographic characters that have a reasonable ASCII stand-in.
    replacement_chars = {
        u'\u201C': u'"',  # LEFT DOUBLE QUOTATION MARK
        u'\u201D': u'"',  # RIGHT DOUBLE QUOTATION MARK
        u'\u2018': u"'",  # LEFT SINGLE QUOTATION MARK
        u'\u2019': u"'",  # RIGHT SINGLE QUOTATION MARK
        u'\u2014': u"-",  # EM DASH
        u'\u2013': u"-",  # EN DASH
        u'\u02DC': u"~",  # SMALL TILDE
        u'\u201A': u"'",  # SINGLE LOW-9 QUOTATION MARK
        u'\u201E': u'"',  # DOUBLE LOW-9 QUOTATION MARK
        u'\u00A0': u" ",  # NO-BREAK SPACE
        u'\u2011': u"-",  # NON-BREAKING HYPHEN
        u'\u2022': u'-',  # BULLET
    }

    # Characters with no acceptable ASCII stand-in; finding one aborts.
    unsupported_chars = frozenset([
        u'\u20AC',  # EURO SIGN
        u'\u0192',  # LATIN SMALL LETTER F WITH HOOK
        u'\u2026',  # HORIZONTAL ELLIPSIS
        u'\u2020',  # DAGGER
        u'\u2021',  # DOUBLE DAGGER
        u'\u02C6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
        u'\u2030',  # PER MILLE SIGN
        u'\u0160',  # LATIN CAPITAL LETTER S WITH CARON
        u'\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        u'\u0152',  # LATIN CAPITAL LIGATURE OE
        u'\u017D',  # LATIN CAPITAL LETTER Z WITH CARON
        u'\u2122',  # TRADE MARK SIGN
        u'\u0161',  # LATIN SMALL LETTER S WITH CARON
        u'\u203A',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        u'\u0153',  # LATIN SMALL LIGATURE OE
        u'\u017E',  # LATIN SMALL LETTER Z WITH CARON
        u'\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
        u'\u0420',  # CYRILLIC CAPITAL LETTER ER
        u'\u043E',  # CYRILLIC SMALL LETTER O
        u'\u0441',  # CYRILLIC SMALL LETTER ES
        u'\u0438',  # CYRILLIC SMALL LETTER I
        u'\u044F',  # CYRILLIC SMALL LETTER YA
        u'\u0103',  # LATIN SMALL LETTER A WITH BREVE
    ])

    # A single translation table handles both steps in one pass: ASCII
    # punctuation maps to None (deleted) and each typographic character maps
    # to its stand-in.  translate() never re-examines its own output, so the
    # quotes/dashes it inserts are not stripped again -- the same result as
    # stripping first and replacing afterwards.
    translation = dict.fromkeys(map(ord, string.punctuation))
    translation.update(
        (ord(char), ascii_char) for char, ascii_char in replacement_chars.items()
    )

    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3

    def _sanitize_str( dirty_string ):
        """Return ``dirty_string`` as sanitized native ``str``."""
        if not isinstance(dirty_string, (bytes, text_type)):
            dirty_string = str(dirty_string)
        if isinstance(dirty_string, bytes):
            # Decode explicitly instead of depending on the process-wide
            # default encoding being switched to UTF-8.
            dirty_string = dirty_string.decode('utf-8')

        if any(char in unsupported_chars for char in dirty_string):
            print("BAD STRING FOUND")
            sys.exit()

        clean_string = dirty_string.translate(translation)
        if str is bytes:
            # Python 2: callers expect a byte string back.
            return clean_string.encode('utf-8')
        return clean_string

    def _sanitize( value ):
        """Sanitize ``value``, descending into lists and dicts in place."""
        if isinstance(value, list):
            for index, item in enumerate(value):
                value[index] = _sanitize(item)
            return value
        if isinstance(value, dict):
            # Snapshot the keys; only values are reassigned.
            for key in list(value):
                value[key] = _sanitize(value[key])
            return value
        # No isinstance(value, str) test here on purpose: text may arrive as
        # ``unicode`` on Python 2, so every remaining value is treated as a
        # string.
        return _sanitize_str(value)

    return _sanitize(data)
if __name__ == "__main__":
    # Interactive demo: prompt for a string, sanitize it, and show the value
    # and type before and after.  Guarded so that importing this module to
    # reuse sanitizeData() does not block waiting for keyboard input.
    try:
        read_line = raw_input  # Python 2.x
    except NameError:
        read_line = input  # Python 3.x

    dirty_string = read_line("Enter Dirty String: ")
    clean_string = sanitizeData(dirty_string)

    # Single-argument print(...) emits the same output on Python 2 and 3.
    print('\nRESULTS')
    print('{0:<25}: {1:<25}'.format( 'Original (Dirty)', dirty_string ))
    print('Type: %s' % type(dirty_string))
    print('{0:<25}: {1:<25}'.format( 'Modified (Sanitized)', clean_string ))
    print('Type: %s' % type(clean_string))
# EXAMPLE RESULTS: | |
# Wayne’s Candies -> Wayne's Candies | |
# Quotes: ‘ ’ “ ” -> ' ' " " | |
# Euro: €123 -> BAD STRING FOUND |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment