Created
October 16, 2012 11:01
-
-
Save micktwomey/3898674 to your computer and use it in GitHub Desktop.
WSGI middleware to replace non-BMP characters in JSON with unknown character. Works around poor UTF-8 support in certain dbs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""WSGI middleware and test code to munge utf-8 | |
To work around narrow Python builds you need to do this:

    re.sub(r'\\U[0-9a-f]{8}', '\\ufffd', s.encode("unicode_escape")).decode("unicode_escape")

(That is, run the substitution on the escaped representation of the string.)
""" | |
import json | |
import logging | |
import re | |
from wsgiref.simple_server import make_server | |
import webob | |
def hello_world_app(environ, start_response):
    """Minimal WSGI app that echoes the request body back to the client.

    The response body is left at webob's default when the request reports
    no content length; otherwise it is set to the request body as-is.
    """
    incoming = webob.Request(environ)
    reply = webob.Response()
    if not incoming.content_length:
        # Nothing was sent, so there is nothing to echo.
        return reply(environ, start_response)
    reply.body = incoming.body
    return reply(environ, start_response)
# Text type returned by json.loads: ``unicode`` on Python 2, ``str`` on 3.
_TEXT_TYPE = type(u"")

# One non-BMP character: either a UTF-16 surrogate pair (how narrow Python 2
# builds store it) or a single code point above U+FFFF (wide builds and
# Python 3).  Lone surrogates are deliberately not matched.
_NON_BMP_RE = re.compile(u'[\ud800-\udbff][\udc00-\udfff]|[^\u0000-\uffff]')


def _replace_non_bmp(value):
    """Return a copy of a decoded JSON value with non-BMP chars as U+FFFD.

    Dict keys and values, list items and strings are processed recursively;
    every other value (numbers, booleans, None) is returned unchanged.
    """
    if isinstance(value, dict):
        return {
            _replace_non_bmp(key): _replace_non_bmp(item)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_replace_non_bmp(item) for item in value]
    if isinstance(value, _TEXT_TYPE):
        return _NON_BMP_RE.sub(u"\ufffd", value)
    return value


def utf8_non_basic_plane_munger(app):
    """Wrap a WSGI app so JSON request bodies contain only BMP characters.

    For requests whose content type is ``application/json`` and that have a
    non-zero content length, the body is decoded as UTF-8, parsed as JSON,
    and every character outside the Basic Multilingual Plane
    (U+0000..U+FFFF) is replaced with U+FFFD.  If anything was replaced, the
    wrapped app receives a copy of the request carrying the re-serialised
    (ASCII-escaped) JSON; otherwise the request is passed through untouched.

    This is best-effort: any error while munging is logged and the original
    request is forwarded unchanged.

    :param app: the WSGI application to wrap.
    :returns: a WSGI callable ``(environ, start_response)``.
    """
    logger = logging.getLogger("utf8_non_basic_plane_munger")

    def utf8_non_basic_plane_munger_middleware(environ, start_response):
        request = webob.Request(environ)
        try:
            if request.content_type == "application/json" and request.content_length:
                logger.info("Spotted JSON request, will attempt to re-encode, removing non BMP (0x0000 -> 0xffff) characters.")
                # Parse first and munge the decoded strings, rather than
                # regex-editing the raw bytes: this keeps JSON escapes such
                # as \" intact and does not misread UTF-8 as Latin-1.
                parsed = json.loads(request.body.decode("utf-8"))
                munged = _replace_non_bmp(parsed)
                if munged != parsed:
                    logger.info("Spotted extended characters in request, munging")
                    # json.dumps defaults to ensure_ascii=True, so the text
                    # is pure ASCII and encodes the same on Python 2 and 3.
                    body = json.dumps(munged).encode("ascii")
                    logger.debug("Re-dumped body into %r (should be a JSON blob again)", body)
                    new_request = request.copy()
                    new_request.body = body
                    request = new_request
                else:
                    logger.info("Didn't spot extended characters in request, leaving alone")
        except Exception:
            # Deliberate best-effort fallback; unlike a bare ``except`` this
            # lets KeyboardInterrupt and SystemExit propagate.
            logger.exception("Problem munging characters in request, leaving alone")
        response = request.get_response(app)
        return response(environ, start_response)

    return utf8_non_basic_plane_munger_middleware
if __name__ == '__main__':
    # Demo entry point: serve the echo app behind the munging middleware,
    # with DEBUG logging so every step of the munging is visible.
    logging.basicConfig(level=logging.DEBUG)
    app = utf8_non_basic_plane_munger(hello_world_app)
    httpd = make_server('', 8000, app)
    # The parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("Serving on port 8000...")
    # Serve until process is killed
    httpd.serve_forever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment