Created
May 3, 2017 18:35
-
-
Save LinuxBozo/451db2dce7f2728195ea0901ac6208bb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:
    _TEXT_TYPE = unicode  # Python 2  # noqa: F821
except NameError:
    _TEXT_TYPE = str  # Python 3

# Only these (lower-cased) charsets are scrubbed; anything else is returned
# untouched by scrubMicrosoftChars().
_SCRUBBABLE_CHARSETS = ('iso-8859-1', 'windows-1252')

# Control characters that are dropped outright.  Tab, line feed and carriage
# return are deliberately absent so ordinary whitespace survives.
_REMOVED_CONTROLS = (
    u"\x00\x08\x0B\x0E\x0F"
    u"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B"
    u"\x1E\x1F"
)

# (characters, replacement): every character in the first string is replaced
# by the second.  The U+0082..U+0096 entries are the C1 control code points
# that appear when Windows-1252 punctuation bytes are decoded as ISO-8859-1.
_ASCII_EQUIVALENTS = (
    (u"\x0C", u"\n"),                                # form feed (page break)
    (u"\x1C\x1D", u'"'),                             # information separators four / three
    (u"\u02BA\u02DD\u030B\u2033\u3003\u2036", u'"'),  # double primes, double acute, ditto
    (u"\u0084\u201E\u201F", u'"'),                   # low / high-reversed double quotes
    (u"\u0093\u201C\u0094\u201D", u'"'),             # left / right double quotes
    (u"\u02EE", u'"'),                               # double apostrophe
    (u"\u02BC\u0313\u0315", u"'"),                   # apostrophe look-alikes
    (u"\u0082\u201A\u201B", u"'"),                   # low / high-reversed single quotes
    (u"\u00B4\u02CA\u0301", u"'"),                   # acute accents
    (u"\u0091\u2018\u0092\u2019", u"'"),             # left / right single quotes
    (u"\u02B9\u2032", u"'"),                         # primes
    (u"\u02CB\u0300\u2035", u"`"),                   # grave accents, reversed prime
    (u"\u0095\u0096\u2013\u2014", u"-"),             # bullet, en dash, em dash
)


def _build_scrub_tables():
    """Derive the translate table and the mojibake sequence list."""
    single_chars = dict((ord(ch), None) for ch in _REMOVED_CONTROLS)
    sequences = []
    for chars, replacement in _ASCII_EQUIVALENTS:
        for ch in chars:
            single_chars[ord(ch)] = replacement
            if ord(ch) >= 0x80:
                # What this character looks like when its UTF-8 bytes were
                # decoded as Latin-1 (one code point per byte).  Generating
                # the sequence avoids hand-typed byte values going stale.
                mojibake = ch.encode('utf-8').decode('latin-1')
                sequences.append((mojibake, replacement))
    return single_chars, tuple(sequences)


_SINGLE_CHAR_TABLE, _MOJIBAKE_SEQUENCES = _build_scrub_tables()


def scrubMicrosoftChars(scrubbedString="", charset='utf-8'):
    """Repair Microsoft special characters by mapping them to plain ASCII.

    Args:
        scrubbedString: Text (or a byte string in ``charset``) to clean.
        charset: Name of the charset the string came from.  Scrubbing only
            happens for 'iso-8859-1' and 'windows-1252' (case-insensitive).

    Returns:
        ``""`` when ``scrubbedString`` is falsy; ``scrubbedString`` unchanged
        when ``charset`` is not one of the scrubbable charsets; otherwise a
        text string in which "smart" quotes, primes, accents, dashes and
        bullets are replaced by ``"``, ``'``, `````, or ``-``, form feeds
        become newlines, and the listed control characters are removed.
    """
    if not scrubbedString:
        return ""

    charset = charset.lower()
    if charset not in _SCRUBBABLE_CHARSETS:
        # since it's not a "friendly" charset, don't scrub, just return it
        return scrubbedString

    # Already-decoded text skips the decoding step; byte strings are decoded
    # with undecodable bytes replaced rather than raising.
    if not isinstance(scrubbedString, _TEXT_TYPE):
        scrubbedString = _TEXT_TYPE(scrubbedString, charset, errors='replace')

    # Multi-character mojibake must be handled BEFORE the single-character
    # pass: the sequences contain C1 characters (e.g. U+0093) that the
    # single-character pass would otherwise rewrite, leaving stray lead
    # characters such as U+00E2 behind.
    for sequence, replacement in _MOJIBAKE_SEQUENCES:
        scrubbedString = scrubbedString.replace(sequence, replacement)

    return scrubbedString.translate(_SINGLE_CHAR_TABLE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment