Created
May 8, 2019 15:11
-
-
Save jvanasco/ae35ee9347c444d11ebd05844d1b2de4 to your computer and use it in GitHub Desktop.
a test harness showing an edge case scenario with bytes and string encoding from a python2 to python3 port.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import print_function | |
""" | |
This test harness showcases an odd scenario when providing compatibility | |
with Python2 and Python3 data. | |
The input to a function is a URL, which in Python2 might have been: | |
url_unicode = u'http://➡.ws/♥' | |
url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' | |
url_bytes = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' # b prefix is allowed | |
or in Python3 as: | |
url_unicode = 'http://➡.ws/♥' # u prefix is allowed | |
url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' | |
url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5' | |
While these all represent the same url in different encodings, it is a bit of | |
a hassle to ensure the correct decoding/encoding of the input when it is unknown | |
across both Python2 and Python3 (see `test_unknown_input`). | |
This should be an edge case for most people. This only popped up because a test | |
suite failed in a python2-3 port that ensured a handful of encodings/decodings | |
would create the same output, as it was in a Service Oriented Architecture | |
application and the input could be from another system. | |
Expected results | |
python2 encodings_test.py | |
pass: TestEncodings.test_unknown_input | |
TestEncodings.test_w3lib_to_unicode__fail3_expected | |
fail: TestEncodings.test_w3lib_bypass_1__fail23_expected | |
TestEncodings.test_w3lib_bypass_2__fail23_expected | |
python3 encodings_test.py | |
pass: TestEncodings.test_unknown_input | |
fail: TestEncodings.test_w3lib_to_unicode__fail3_expected | |
TestEncodings.test_w3lib_bypass_1__fail23_expected | |
TestEncodings.test_w3lib_bypass_2__fail23_expected | |
""" | |
import unittest | |
import six | |
import pdb | |
from w3lib.url import safe_url_string | |
# ============================================================================== | |
def is_ascii(s):
    """Return True when every element of `s` is in the 7-bit ASCII range.

    Accepts text as well as byte input. Iterating `bytes` on Python 3 (or a
    `bytearray` on either major version) yields integers rather than
    one-character strings, so integers are compared directly and only
    characters are passed through `ord`.

    An empty `s` is considered ASCII.
    """
    return all((c if isinstance(c, int) else ord(c)) < 128 for c in s)
def w3lib_to_unicode(text, encoding=None, errors='strict'):
    """Return the unicode form of `text`; unicode input is returned untouched.

    Behavioural copy of ``to_unicode`` from w3lib
    (https://github.com/scrapy/w3lib/blob/master/w3lib/util.py), kept in this
    file for test purposes: it is where the error seen in `safe_url_string`
    traces down to.

    :param text: a unicode object (returned as-is) or a bytes object (decoded).
    :param encoding: codec used for decoding bytes; ``None`` means 'utf-8'.
    :param errors: error handler handed to ``bytes.decode``.
    :raises TypeError: if `text` is neither unicode nor bytes.
    """
    if isinstance(text, six.text_type):
        return text
    if isinstance(text, bytes):
        codec = 'utf-8' if encoding is None else encoding
        return text.decode(codec, errors)
    raise TypeError('to_unicode must receive a bytes, str or unicode '
                    'object, got %s' % type(text).__name__)
class TestEncodings(unittest.TestCase):
    """Feed one URL, spelled as unicode / native string / bytes, through several conversions.

    The method-name suffixes follow the "Expected results" list in the file
    header: ``__fail3_expected`` is listed as failing under Python 3 only,
    ``__fail23_expected`` as failing under both Python 2 and Python 3, and
    ``test_unknown_input`` as passing under both.
    """

    # The same URL in the three spellings described in the file header.
    url_unicode = u'http://➡.ws/♥'
    url_string = 'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    url_bytes = b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
    # Value `safe_url_string` must return for every input in test_unknown_input.
    url_safe = 'http://xn--hgi.ws/%E2%99%A5'
    # Not referenced by any test in this class.
    url_bad_1 = 'http://â\x9e¡.ws/â\x99¥'

    # (label, value) pairs iterated by every test.
    inputs = (
        ('unicode', url_unicode),
        ('string', url_string),
        ('url_bytes', url_bytes),
    )

    # Set to True to print the failing inputs and drop into pdb right before a
    # test raises. Defaults to off, like the hard-coded `if False:` debug
    # blocks this flag replaces.
    debug_failures = False

    @staticmethod
    def _to_url_candidate(value):
        """Turn an input of unknown flavour into something `safe_url_string` accepts.

        Python 2 distinguishes unicode from str but not str from bytes;
        Python 3 distinguishes str from bytes but not unicode from str.
        Hence the two separate strategies below.
        """
        if not six.PY3:
            # Python 2: only unicode objects need encoding.
            if isinstance(value, six.text_type):
                return value.encode('utf-8')
            return value

        if isinstance(value, bytes):
            return value

        # Python 3 str: 'raw_unicode_escape' emits code points below 256 as
        # single raw bytes and anything higher as an ASCII \uXXXX escape.
        round_tripped = value.encode('raw_unicode_escape').decode()
        if is_ascii(round_tripped):
            # input: self.url_unicode (real text, so encode it as UTF-8)
            return value.encode('utf-8')
        # input: self.url_string (UTF-8 bytes stored as characters, now decoded)
        return round_tripped

    def _raise_if_failed(self, label, fails):
        """Raise ValueError describing `fails`; do nothing when the list is empty.

        `fails` holds ``(input_type, input, converted)`` tuples. When
        `debug_failures` is set, the label and each failing input are printed
        and pdb is entered before raising.
        """
        if not fails:
            return
        if self.debug_failures:
            print(label)
            for fail in fails:
                print(fail[0], fail[1])
            pdb.set_trace()
        raise ValueError("result != self.url_unicode: %s" % fails)

    def _bypass_failures(self):
        """Convert every input without w3lib and return those that differ from `url_unicode`.

        Text is encoded to UTF-8 and binary data is decoded with the default
        codec. If that decode raises UnicodeDecodeError, the raw input is
        kept instead.
        """
        fails = []
        for input_type, value in self.inputs:
            converted = None
            if isinstance(value, six.text_type):
                converted = value.encode('utf-8')
            elif isinstance(value, six.binary_type):
                try:
                    converted = value.decode()
                except UnicodeDecodeError:  # Py2 catch
                    converted = value
            if converted != self.url_unicode:
                fails.append((input_type, value, converted))
        return fails

    def test_unknown_input(self):
        """Every input flavour, once normalised, yields `url_safe`."""
        for _input_type, value in self.inputs:
            result = safe_url_string(self._to_url_candidate(value))
            self.assertEqual(result, self.url_safe)

    def test_w3lib_to_unicode__fail3_expected(self):
        """Check that `w3lib_to_unicode` maps every input to `url_unicode`."""
        fails = []
        for input_type, value in self.inputs:
            result = w3lib_to_unicode(value)
            if result != self.url_unicode:
                fails.append((input_type, value, result))
        self._raise_if_failed("test_w3lib_to_unicode", fails)

    def test_w3lib_bypass_1__fail23_expected(self):
        r"""Convert the inputs by hand instead of through w3lib.

        Converted values recorded in the original notes (they match what
        this conversion yields on Python 3):

            unicode    b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5'
            string     b'http://\xc3\xa2\xc2\x9e\xc2\xa1.ws/\xc3\xa2\xc2\x99\xc2\xa5'
            url_bytes  http://➡.ws/♥
        """
        self._raise_if_failed("test_w3lib_bypass_1", self._bypass_failures())

    def test_w3lib_bypass_2__fail23_expected(self):
        r"""Second hand-rolled conversion; currently the same logic as bypass_1.

        Values recorded in the original notes:

            unicode    http://➡.ws/♥
            string     http://â\x9e¡.ws/â\x99¥
            url_bytes  http://➡.ws/♥

        NOTE(review): those values show text inputs left unchanged, which is
        not what the shared conversion does (it encodes text to UTF-8).
        Confirm whether this test was meant to use a different conversion.
        """
        self._raise_if_failed("test_w3lib_bypass_2", self._bypass_failures())
# Script entry point: hand control to unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment