Skip to content

Instantly share code, notes, and snippets.

@loic
Created July 27, 2014 11:15
Show Gist options
  • Save loic/27ac306c3ebfc50ea6de to your computer and use it in GitHub Desktop.
Save loic/27ac306c3ebfc50ea6de to your computer and use it in GitHub Desktop.
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index 6a2f877..21712bc 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import codecs
@@ -188,7 +189,9 @@ def iri_to_uri(iri):
assuming input is either UTF-8 or unicode already, we can simplify things a
little from the full method.
- Returns an ASCII string containing the encoded result.
+ Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
+ (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
+ (e.g. '/I%20%E2%99%A5%20Django/').
"""
# The list of safe characters here is constructed from the "reserved" and
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -213,7 +216,9 @@ def uri_to_iri(uri):
Resource Identifier(IRI).
This is the algorithm from section 3.2 of RFC 3987.
- Returns a valid IRI utf-8 encoded bytes.
+
+ Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
+ UTF-8 bytes containing the decoded result (e.g. '/I \xe2\x99\xa5 Django/').
"""
if uri is None:
return uri
diff --git a/tests/utils_tests/test_encoding.py b/tests/utils_tests/test_encoding.py
index fc3bc24..7f16d96 100644
--- a/tests/utils_tests/test_encoding.py
+++ b/tests/utils_tests/test_encoding.py
@@ -40,12 +40,6 @@ class TestEncodingUtils(unittest.TestCase):
today = datetime.date.today()
self.assertEqual(force_bytes(today, strings_only=True), today)
- def test_filepath_to_uri(self):
- self.assertEqual(filepath_to_uri('upload\\чубака.mp4'),
- 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
- self.assertEqual(filepath_to_uri('upload\\чубака.mp4'.encode('utf-8')),
- 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
-
@unittest.skipIf(six.PY3, "tests a class not defining __str__ under Python 2")
def test_decorated_class_without_str(self):
with self.assertRaises(ValueError):
@@ -53,65 +47,71 @@ class TestEncodingUtils(unittest.TestCase):
class NoStr(object):
pass
- def test_iri_to_uri(self):
- self.assertEqual(iri_to_uri('red%09ros\xe9#red'),
- 'red%09ros%C3%A9#red')
- self.assertEqual(iri_to_uri('/blog/for/J\xfcrgen M\xfcnster/'),
- '/blog/for/J%C3%BCrgen%20M%C3%BCnster/')
+class TestRFC3987IEncodingUtils(unittest.TestCase):
+
+ def test_filepath_to_uri(self):
+ self.assertEqual(filepath_to_uri('upload\\чубака.mp4'),
+ 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
+ self.assertEqual(filepath_to_uri('upload\\чубака.mp4'.encode('utf-8')),
+ 'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
+
+ def test_iri_to_uri(self):
+ cases = [
+ # Valid UTF-8 sequences are encoded.
+ ('red%09rosé#red', 'red%09ros%C3%A9#red'),
+ ('/blog/for/Jürgen Münster/', '/blog/for/J%C3%BCrgen%20M%C3%BCnster/'),
+ ('locations/%s' % urlquote_plus('Paris & Orléans'), 'locations/Paris+%26+Orl%C3%A9ans'),
- self.assertEqual(iri_to_uri('locations/%s' % urlquote_plus('Paris & Orl\xe9ans')),
- 'locations/Paris+%26+Orl%C3%A9ans')
+ # Reserved chars remain unescaped.
+ ('%&', '%&'),
+ ('red&♥ros%#red', 'red&%E2%99%A5ros%#red'),
+ ]
- def test_iri_to_uri_idempotent(self):
- self.assertEqual(iri_to_uri(iri_to_uri('red%09ros\xe9#red')),
- 'red%09ros%C3%A9#red')
+ for iri, uri in cases:
+ self.assertEqual(iri_to_uri(iri), uri)
- def test_iri_to_uri_reserved(self):
- self.assertEqual(iri_to_uri('%&'), '%&')
- self.assertEqual(iri_to_uri('red&♥ros%#red'),
- 'red&%E2%99%A5ros%#red')
+ # Test idempotency.
+ self.assertEqual(iri_to_uri(iri_to_uri(iri)), uri)
def test_uri_to_iri(self):
- def test(uri):
- return uri_to_iri(uri).decode('utf-8')
-
- self.assertEqual(test('~%A9helloworld'), '~%A9helloworld')
- self.assertEqual(test('d%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA/'), 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA/')
- self.assertEqual(test('/%E2%99%E2%99%A5/'), '/%E2%99♥/')
- self.assertEqual(test('/%E2%99%A5'), '/♥')
- self.assertEqual(test('/%E2%98%80%E2%99%A5/'), '/☀♥/')
- self.assertEqual(test('/%E2%98%8E%E2%A9%E2%99%A5/'), '/☎%E2%A9♥/')
- self.assertEqual(test('/%E2%99%BF%99☃%E2%99%A3%E2%98%BD%A9'), '/♿%99☃♣☽%A9')
- self.assertEqual(test('/%E2%98%90/fred?utf8=%E2%9C%93'), '/☐/fred?utf8=✓')
- self.assertEqual(test('/☐/fred?utf8=☓'), '/☐/fred?utf8=☓')
- self.assertEqual(test('/üsername'), '/üsername')
- self.assertEqual(test('/üser:pässword@☃'), '/üser:pässword@☃')
- self.assertEqual(test('/%3Fmeh?foo=%26%A9'), '/?meh?foo=&%A9')
- self.assertEqual(test('/%E2%A8%87%87%A5%E2%A8%A0'), '/⨇%87%A5⨠')
- self.assertEqual(test('/你好'), '/你好')
-
- def test_uri_to_iri_idempotent(self):
- def test(uri):
- return uri_to_iri(uri).decode('utf-8')
-
- self.assertEqual(test(test('~%A9helloworld')), '~%A9helloworld')
- self.assertEqual(test(test('d%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA/')), 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA/')
- self.assertEqual(test(test('/%E2%99%E2%99%A5/')), '/%E2%99♥/')
- self.assertEqual(test(test('/%E2%99%A5')), '/♥')
-
- def test_complementary(self):
- def test_iri_to_iri(iri):
+ cases = [
+ # Valid UTF-8 sequences are decoded.
+ ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+
+ # Broken UTF-8 sequences remain escaped.
+ ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+ ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+ ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+ ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+ ]
+
+ for uri, iri in cases:
iri = iri.encode('utf-8')
- self.assertEqual(uri_to_iri(iri_to_uri(iri)), iri)
- def test_uri_to_uri(uri):
+ self.assertEqual(uri_to_iri(uri), iri)
+
+ # Test idempotency.
+ self.assertEqual(uri_to_iri(uri_to_iri(uri)), iri)
+
+ def test_complementarity(self):
+ cases = [
+ ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
+ ('%&', '%&'),
+ ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
+ ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+ ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+ ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+ ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+ ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+ ]
+
+ for uri, iri in cases:
+ iri = iri.encode('utf-8')
self.assertEqual(iri_to_uri(uri_to_iri(uri)), uri)
-
- test_iri_to_iri('~%A9helloworld')
- test_iri_to_iri('/üser:pässword@☃')
- test_iri_to_iri('/你好')
-
- test_uri_to_uri('~%A9helloworld')
- test_uri_to_uri('/%E2%99%A5')
- test_uri_to_uri('/%E2%98%80%E2%99%A5')
+ self.assertEqual(uri_to_iri(iri_to_uri(iri)), iri)
+#
\ No newline at end of file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment