Skip to content

Instantly share code, notes, and snippets.

@msukmanowsky
Created June 23, 2014 20:59
Show Gist options
  • Save msukmanowsky/0cde15bfdb7164359af2 to your computer and use it in GitHub Desktop.
Save msukmanowsky/0cde15bfdb7164359af2 to your computer and use it in GitHub Desktop.
Monkey patches needed to fix a bug in how Unicode percent-encoded strings are handled in Python's unquote function.
import urlparse
import urllib
import urllib2
def patch_unquote():
    """Install this module's ``unquote`` on urllib, urllib2 and urlparse.

    Each module's ``unquote`` attribute is rebound to the ``unquote``
    function defined in this file, so code that looks the function up
    through those modules afterwards gets the replacement.
    """
    for patched_module in (urllib, urllib2, urlparse):
        patched_module.unquote = unquote
def unquote_to_bytearray(s):
    """Decode the percent-escapes in a byte string.

    ``s`` is split on ``%``. Every piece that begins with two hexadecimal
    digits contributes the byte those digits encode, followed by the rest
    of the piece. A ``%`` that is not followed by two hex digits is not an
    escape and is copied to the output verbatim together with the text
    after it.

    Args:
        s: Percent-encoded byte string (a Python 2 ``str``).

    Returns:
        ``s`` itself, unchanged, when it contains no ``%``; otherwise a
        ``bytearray`` holding the decoded bytes.
    """
    parts = s.split(b"%")
    if len(parts) == 1:
        return s  # no unquoting needed
    hex_digits = b"0123456789abcdefABCDEF"
    result = bytearray(parts[0])
    # Process pair-wise hex characters and add them to the byte array
    for item in parts[1:]:
        # Validate both digits explicitly instead of relying on int():
        # int() also accepts a lone digit, leading whitespace and a sign,
        # so "%4", "% 4" or "%+4" would otherwise be decoded as if they
        # were real escapes.
        if len(item) >= 2 and item[0] in hex_digits and item[1] in hex_digits:
            result.append(int(item[:2], 16))  # hex part
            result += item[2:]  # non-hex part
        else:
            # Not a valid escape, copy over the invalid value verbatim
            result += b"%"
            result += item
    return result
def unquote(s, encoding="utf-8", errors="replace"):
    """Unquote a percent-encoded string.

    A ``unicode`` argument is first encoded to bytes with ``encoding`` and
    ``errors``, unquoted, and then decoded back with the same ``encoding``
    and ``errors``. Any other argument is unquoted as-is and the result is
    converted with ``str``.
    """
    is_unicode_input = isinstance(s, unicode)
    raw = s.encode(encoding, errors) if is_unicode_input else s
    unquoted = unquote_to_bytearray(raw)
    if is_unicode_input:
        return unquoted.decode(encoding, errors)
    return str(unquoted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment