Created
June 23, 2014 20:59
-
-
Save msukmanowsky/0cde15bfdb7164359af2 to your computer and use it in GitHub Desktop.
Monkey patches needed to fix a bug in how Unicode percent-encoded strings are handled in Python's unquote function.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urlparse | |
import urllib | |
import urllib2 | |
def patch_unquote():
    """Replace the standard-library ``unquote`` with this module's version.

    Rebinds the ``unquote`` attribute on ``urllib``, ``urllib2`` and
    ``urlparse`` so that code looking the function up through those
    modules gets the replacement defined in this file.
    """
    for target_module in (urllib, urllib2, urlparse):
        target_module.unquote = unquote
def unquote_to_bytearray(s):
    """Percent-decode the byte string *s* into a ``bytearray``.

    Every ``%XX`` escape, where ``XX`` is exactly two hexadecimal digits, is
    replaced by the single byte it encodes.  Anything else that follows a
    ``%`` -- a truncated escape such as ``%a``, or non-hex characters such as
    ``%zz`` or ``% 1`` -- is copied through verbatim, percent sign included.

    Args:
        s: Byte string to decode (``str`` on Python 2).

    Returns:
        A new ``bytearray`` with the decoded bytes.  A ``bytearray`` is
        returned even when *s* contains no ``%`` at all, so callers always
        get the same type back.
    """
    hex_digits = bytearray(b"0123456789abcdefABCDEF")

    parts = s.split(b"%")
    result = bytearray(parts[0])

    # Every element after the first was preceded by a "%" in the input.
    for item in parts[1:]:
        escape = bytearray(item[:2])
        # Validate the digits explicitly: int(..., 16) alone would also accept
        # a single digit, surrounding whitespace or a sign ("a", " 1", "+1").
        is_escape = (
            len(escape) == 2
            and escape[0] in hex_digits
            and escape[1] in hex_digits
        )
        if is_escape:
            result.append(int(bytes(escape), 16))  # decoded byte
            result += item[2:]  # literal text after the escape
        else:
            # Not a valid escape: keep the "%" and what follows it unchanged.
            result += b"%"
            result += item
    return result
def unquote(s, encoding="utf-8", errors="replace"):
    """Decode the percent-escapes in *s* and return the result.

    A ``unicode`` argument is first encoded to bytes with *encoding* and
    *errors*, percent-decoded, and then decoded back with the same
    *encoding* and *errors*, so the result is ``unicode`` again.  Any other
    argument is percent-decoded as-is and returned as a ``str``.
    """
    is_text = isinstance(s, unicode)
    raw = s.encode(encoding, errors) if is_text else s
    decoded = unquote_to_bytearray(raw)
    if is_text:
        return decoded.decode(encoding, errors)
    return str(decoded)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment