jhyland87 · July 22, 2016 17:31
diff --git a/sanitize.py b/sanitize.py
 import fileinput
 import sys
 import string
 reload(sys)
 sys.setdefaultencoding("utf-8")

 # Typographic characters mapped to their plain-ASCII stand-ins.
 _SANITIZE_REPLACEMENTS = {
     u'\u201C': u'"',    # LEFT DOUBLE QUOTATION MARK
     u'\u201D': u'"',    # RIGHT DOUBLE QUOTATION MARK
     u'\u2018': u"'",    # LEFT SINGLE QUOTATION MARK
     u'\u2019': u"'",    # RIGHT SINGLE QUOTATION MARK
     u'\u2014': u'-',    # EM DASH
     u'\u2013': u'-',    # EN DASH
     u'\u02DC': u'~',    # SMALL TILDE
     u'\u201A': u"'",    # SINGLE LOW-9 QUOTATION MARK
     u'\u201E': u'"',    # DOUBLE LOW-9 QUOTATION MARK
     u'\u00A0': u' ',    # NO-BREAK SPACE
     u'\u2011': u'-',    # NON-BREAKING HYPHEN
     u'\u2022': u'-',    # BULLET
 }

 # Characters with no accepted ASCII stand-in; finding any of them aborts.
 _SANITIZE_UNSUPPORTED = frozenset((
     u'\u20AC',  # EURO SIGN
     u'\u0192',  # LATIN SMALL LETTER F WITH HOOK
     u'\u2026',  # HORIZONTAL ELLIPSIS
     u'\u2020',  # DAGGER
     u'\u2021',  # DOUBLE DAGGER
     u'\u02C6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
     u'\u2030',  # PER MILLE SIGN
     u'\u0160',  # LATIN CAPITAL LETTER S WITH CARON
     u'\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
     u'\u0152',  # LATIN CAPITAL LIGATURE OE
     u'\u017D',  # LATIN CAPITAL LETTER Z WITH CARON
     u'\u2122',  # TRADE MARK SIGN
     u'\u0161',  # LATIN SMALL LETTER S WITH CARON
     u'\u203A',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
     u'\u0153',  # LATIN SMALL LIGATURE OE
     u'\u017E',  # LATIN SMALL LETTER Z WITH CARON
     u'\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
     u'\u0420',  # CYRILLIC CAPITAL LETTER ER
     u'\u043E',  # CYRILLIC SMALL LETTER O
     u'\u0441',  # CYRILLIC SMALL LETTER ES
     u'\u0438',  # CYRILLIC SMALL LETTER I
     u'\u044F',  # CYRILLIC SMALL LETTER YA
     u'\u0103',  # LATIN SMALL LETTER A WITH BREVE
 ))

 # One code-point table: ASCII punctuation maps to None (deleted) and every
 # typographic character maps to its stand-in. translate() never re-scans its
 # own output, so the stand-in quotes/dashes survive -- the same result as
 # "strip punctuation first, then replace".
 _SANITIZE_TABLE = dict.fromkeys(map(ord, string.punctuation))
 _SANITIZE_TABLE.update(
     (ord(char), plain) for char, plain in _SANITIZE_REPLACEMENTS.items())


 def _sanitize_text(value):
     """Return ``value`` as a cleaned native string.

     Byte strings are decoded as UTF-8, text is used as-is, and any other
     object goes through ``str()``. The encoding is always spelled out, so
     the result does not depend on the interpreter's default encoding.
     """
     text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
     text = value if isinstance(value, (bytes, text_type)) else str(value)
     if isinstance(text, bytes):
         text = text.decode('utf-8')

     if not _SANITIZE_UNSUPPORTED.isdisjoint(text):
         # sys.exit() rather than the site-injected exit(): the latter is
         # missing under ``python -S`` and closes stdin before raising.
         print("BAD STRING FOUND")
         sys.exit()

     cleaned = text.translate(_SANITIZE_TABLE)
     if str is bytes:
         # Python 2: keep handing back a UTF-8 encoded byte string.
         return cleaned.encode('utf-8')
     return cleaned


 def sanitizeData(data):
     """Strip ASCII punctuation and normalise typographic characters.

     * ``list``: every element is sanitized recursively; the list is
       modified in place and returned.
     * ``dict``: every value is sanitized recursively (keys are left
       untouched); the dict is modified in place and returned.
     * anything else: converted to a string and cleaned. All characters in
       ``string.punctuation`` are removed, then curly quotes, dashes, the
       no-break space and similar characters are replaced by ASCII
       equivalents.

     NOTE(review): non-string scalars are stringified too, so ``5`` comes
     back as ``'5'`` and ``-1.5`` as ``'15'``. This matches the previous
     behaviour; confirm it is what callers want.

     Returns:
         The same list/dict object, or a native ``str``.

     Raises:
         SystemExit: after printing ``BAD STRING FOUND`` when the text
             contains a character with no accepted ASCII stand-in.
         UnicodeDecodeError: when a byte string is not valid UTF-8.
     """
     if isinstance(data, list):
         for index, item in enumerate(data):
             data[index] = sanitizeData(item)
         return data

     if isinstance(data, dict):
         # Snapshot the keys so reassigning values never disturbs iteration.
         for key in list(data):
             data[key] = sanitizeData(data[key])
         return data

     return _sanitize_text(data)

 # Read the text to clean from the console (raw_input: Python 2.x only).
 dirty_string = raw_input("Enter Dirty String: ")
 clean_string = sanitizeData(dirty_string)

 # Show each value next to its runtime type, original first.
 print('\nRESULTS')
 for _label, _value in (
         ('Original (Dirty)', dirty_string),
         ('Modified (Sanitized)', clean_string)):
     print('{0:<25}: {1:<25}'.format(_label, _value))
     print('Type: %s' % type(_value))

 # EXAMPLE RESULTS:
 # Wayne’s Candies   ->  Wayne's Candies
 # Quotes: ‘ ’ “ ”   ->  ' ' " "
 # Euro: €123        ->  BAD STRING FOUND
	import fileinput
	import sys
	import string
	reload(sys)
	sys.setdefaultencoding("utf-8")

	# Typographic characters mapped to their plain-ASCII stand-ins.
	_SANITIZE_REPLACEMENTS = {
	    u'\u201C': u'"',    # LEFT DOUBLE QUOTATION MARK
	    u'\u201D': u'"',    # RIGHT DOUBLE QUOTATION MARK
	    u'\u2018': u"'",    # LEFT SINGLE QUOTATION MARK
	    u'\u2019': u"'",    # RIGHT SINGLE QUOTATION MARK
	    u'\u2014': u'-',    # EM DASH
	    u'\u2013': u'-',    # EN DASH
	    u'\u02DC': u'~',    # SMALL TILDE
	    u'\u201A': u"'",    # SINGLE LOW-9 QUOTATION MARK
	    u'\u201E': u'"',    # DOUBLE LOW-9 QUOTATION MARK
	    u'\u00A0': u' ',    # NO-BREAK SPACE
	    u'\u2011': u'-',    # NON-BREAKING HYPHEN
	    u'\u2022': u'-',    # BULLET
	}

	# Characters with no accepted ASCII stand-in; finding any of them aborts.
	_SANITIZE_UNSUPPORTED = frozenset((
	    u'\u20AC',  # EURO SIGN
	    u'\u0192',  # LATIN SMALL LETTER F WITH HOOK
	    u'\u2026',  # HORIZONTAL ELLIPSIS
	    u'\u2020',  # DAGGER
	    u'\u2021',  # DOUBLE DAGGER
	    u'\u02C6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
	    u'\u2030',  # PER MILLE SIGN
	    u'\u0160',  # LATIN CAPITAL LETTER S WITH CARON
	    u'\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
	    u'\u0152',  # LATIN CAPITAL LIGATURE OE
	    u'\u017D',  # LATIN CAPITAL LETTER Z WITH CARON
	    u'\u2122',  # TRADE MARK SIGN
	    u'\u0161',  # LATIN SMALL LETTER S WITH CARON
	    u'\u203A',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
	    u'\u0153',  # LATIN SMALL LIGATURE OE
	    u'\u017E',  # LATIN SMALL LETTER Z WITH CARON
	    u'\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
	    u'\u0420',  # CYRILLIC CAPITAL LETTER ER
	    u'\u043E',  # CYRILLIC SMALL LETTER O
	    u'\u0441',  # CYRILLIC SMALL LETTER ES
	    u'\u0438',  # CYRILLIC SMALL LETTER I
	    u'\u044F',  # CYRILLIC SMALL LETTER YA
	    u'\u0103',  # LATIN SMALL LETTER A WITH BREVE
	))

	# One code-point table: ASCII punctuation maps to None (deleted) and every
	# typographic character maps to its stand-in. translate() never re-scans its
	# own output, so the stand-in quotes/dashes survive -- the same result as
	# "strip punctuation first, then replace".
	_SANITIZE_TABLE = dict.fromkeys(map(ord, string.punctuation))
	_SANITIZE_TABLE.update(
	    (ord(char), plain) for char, plain in _SANITIZE_REPLACEMENTS.items())


	def _sanitize_text(value):
	    """Return ``value`` as a cleaned native string.

	    Byte strings are decoded as UTF-8, text is used as-is, and any other
	    object goes through ``str()``. The encoding is always spelled out, so
	    the result does not depend on the interpreter's default encoding.
	    """
	    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
	    text = value if isinstance(value, (bytes, text_type)) else str(value)
	    if isinstance(text, bytes):
	        text = text.decode('utf-8')

	    if not _SANITIZE_UNSUPPORTED.isdisjoint(text):
	        # sys.exit() rather than the site-injected exit(): the latter is
	        # missing under ``python -S`` and closes stdin before raising.
	        print("BAD STRING FOUND")
	        sys.exit()

	    cleaned = text.translate(_SANITIZE_TABLE)
	    if str is bytes:
	        # Python 2: keep handing back a UTF-8 encoded byte string.
	        return cleaned.encode('utf-8')
	    return cleaned


	def sanitizeData(data):
	    """Strip ASCII punctuation and normalise typographic characters.

	    * ``list``: every element is sanitized recursively; the list is
	      modified in place and returned.
	    * ``dict``: every value is sanitized recursively (keys are left
	      untouched); the dict is modified in place and returned.
	    * anything else: converted to a string and cleaned. All characters in
	      ``string.punctuation`` are removed, then curly quotes, dashes, the
	      no-break space and similar characters are replaced by ASCII
	      equivalents.

	    NOTE(review): non-string scalars are stringified too, so ``5`` comes
	    back as ``'5'`` and ``-1.5`` as ``'15'``. This matches the previous
	    behaviour; confirm it is what callers want.

	    Returns:
	        The same list/dict object, or a native ``str``.

	    Raises:
	        SystemExit: after printing ``BAD STRING FOUND`` when the text
	            contains a character with no accepted ASCII stand-in.
	        UnicodeDecodeError: when a byte string is not valid UTF-8.
	    """
	    if isinstance(data, list):
	        for index, item in enumerate(data):
	            data[index] = sanitizeData(item)
	        return data

	    if isinstance(data, dict):
	        # Snapshot the keys so reassigning values never disturbs iteration.
	        for key in list(data):
	            data[key] = sanitizeData(data[key])
	        return data

	    return _sanitize_text(data)

	# Read the text to clean from the console (raw_input: Python 2.x only).
	dirty_string = raw_input("Enter Dirty String: ")
	clean_string = sanitizeData(dirty_string)

	# Show each value next to its runtime type, original first.
	print('\nRESULTS')
	for _label, _value in (
	        ('Original (Dirty)', dirty_string),
	        ('Modified (Sanitized)', clean_string)):
	    print('{0:<25}: {1:<25}'.format(_label, _value))
	    print('Type: %s' % type(_value))

	# EXAMPLE RESULTS:
	# Wayne’s Candies -> Wayne's Candies
	# Quotes: ‘ ’ “ ” -> ' ' " "
	# Euro: €123 -> BAD STRING FOUND