aratak · June 16, 2016 16:30
diff --git a/symbols.rb b/symbols.rb
      # It might seem like we should more correctly reject these sequences in
      # the encoder, and I would personally agree, but the sad reality is that
      # we do not distinguish binary and textual data in our language, and so we
      # wind up with the same thing - a string - containing both.
      #
      # That leads to the position where we must treat these invalid sequences,
      # which are both legitimate binary content, and illegitimate potential
      # attacks on the system, as something that passes through correctly in
      # a string.

    # Fixture list of byte strings that are not well-formed UTF-8: over-long
    # encodings, stray continuation bytes, lead bytes without their
    # continuation bytes, and bytes that never occur in UTF-8.
    # How the list is consumed is outside this excerpt.
    [
      "\xC0\xAE",                 # over-long UTF-8 '.' character
      "\xC0\x80",                 # over-long NULL byte
      "\xC0\xFF",                 # 0xC0 lead byte followed by 0xFF, which is not a continuation byte
      "\xC1\xAE",                 # over-long 'n' (0x6E)
      "\xC1\x80",                 # over-long '@' (0x40)
      "\xC1\xFF",                 # 0xC1 lead byte followed by 0xFF, which is not a continuation byte
      "\x80",                     # first continuation byte
      "\xbf",                     # last continuation byte
      # all possible continuation bytes in one shot
      "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F" +
      "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F" +
      "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" +
      "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
      # lonely start characters - first, all possible two byte sequences
      # (every lead byte 0xC0..0xDF, each followed by a space instead of a
      # continuation byte; the two halves are joined into a single string)
      "\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 \xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF " +
      "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 \xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
      # and so for three byte sequences, four, five, and six, as follow.
      "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 \xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",  # three-byte leads 0xE0..0xEF
      "\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ",  # four-byte leads 0xF0..0xF7
      "\xF8 \xF9 \xFA \xFB ",     # five-byte leads 0xF8..0xFB
      "\xFC \xFD ",               # six-byte leads 0xFC..0xFD
      # sequences with the last byte missing
      # first row: lowest lead byte of each length, padded with 0x80;
      # second row: highest lead byte of each length, padded with 0xBF.
      # NOTE(review): "\xE0" is a three-byte lead with BOTH continuation bytes
      # missing, unlike its neighbours (compare "\xEF\xBF" below) - confirm
      # whether "\xE0\x80" was intended.
      "\xC0", "\xE0", "\xF0\x80\x80", "\xF8\x80\x80\x80", "\xFC\x80\x80\x80\x80",
      "\xDF", "\xEF\xBF", "\xF7\xBF\xBF", "\xFB\xBF\xBF\xBF", "\xFD\xBF\xBF\xBF\xBF",
      # impossible bytes
      "\xFE", "\xFF", "\xFE\xFE\xFF\xFF",
      # over-long '/' character
      # (0x2F encoded in two, three, four, five and six bytes)
      "\xC0\xAF",
      "\xE0\x80\xAF",
      "\xF0\x80\x80\xAF",
      "\xF8\x80\x80\x80\xAF",
      "\xFC\x80\x80\x80\x80\xAF",
      # maximum overlong sequences
      # (the largest value that is still over-long at each length)
      "\xc1\xbf",                 # U+007F in two bytes
      "\xe0\x9f\xbf",             # U+07FF in three bytes
      "\xf0\x8f\xbf\xbf",         # U+FFFF in four bytes
      "\xf8\x87\xbf\xbf\xbf",     # 0x1FFFFF in five bytes
      "\xfc\x83\xbf\xbf\xbf\xbf", # 0x3FFFFFF in six bytes
      # overlong NUL
      # (0x00 encoded in two, three, four, five and six bytes)
      "\xc0\x80",                 # NOTE(review): same bytes as the "\xC0\x80" entry near the top of this list
      "\xe0\x80\x80",
      "\xf0\x80\x80\x80",
      "\xf8\x80\x80\x80\x80",
      "\xfc\x80\x80\x80\x80\x80",
    ]
	# It might seem like we should more correctly reject these sequences in
	# the encoder, and I would personally agree, but the sad reality is that
	# we do not distinguish binary and textual data in our language, and so we
	# wind up with the same thing - a string - containing both.
	#
	# That leads to the position where we must treat these invalid sequences,
	# which are both legitimate binary content, and illegitimate potential
	# attacks on the system, as something that passes through correctly in
	# a string.

	# Fixture list of byte strings that are not well-formed UTF-8: over-long
	# encodings, stray continuation bytes, lead bytes without their
	# continuation bytes, and bytes that never occur in UTF-8.
	# How the list is consumed is outside this excerpt.
	[
	"\xC0\xAE", # over-long UTF-8 '.' character
	"\xC0\x80", # over-long NULL byte
	"\xC0\xFF", # 0xC0 lead byte followed by 0xFF, which is not a continuation byte
	"\xC1\xAE", # over-long 'n' (0x6E)
	"\xC1\x80", # over-long '@' (0x40)
	"\xC1\xFF", # 0xC1 lead byte followed by 0xFF, which is not a continuation byte
	"\x80", # first continuation byte
	"\xbf", # last continuation byte
	# all possible continuation bytes in one shot
	"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F" +
	"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F" +
	"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" +
	"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
	# lonely start characters - first, all possible two byte sequences
	# (every lead byte 0xC0..0xDF, each followed by a space instead of a
	# continuation byte; the two halves are joined into a single string)
	"\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 \xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF " +
	"\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 \xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
	# and so for three byte sequences, four, five, and six, as follow.
	"\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 \xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ", # three-byte leads 0xE0..0xEF
	"\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ", # four-byte leads 0xF0..0xF7
	"\xF8 \xF9 \xFA \xFB ", # five-byte leads 0xF8..0xFB
	"\xFC \xFD ", # six-byte leads 0xFC..0xFD
	# sequences with the last byte missing
	# first row: lowest lead byte of each length, padded with 0x80;
	# second row: highest lead byte of each length, padded with 0xBF.
	# NOTE(review): "\xE0" is a three-byte lead with BOTH continuation bytes
	# missing, unlike its neighbours (compare "\xEF\xBF" below) - confirm
	# whether "\xE0\x80" was intended.
	"\xC0", "\xE0", "\xF0\x80\x80", "\xF8\x80\x80\x80", "\xFC\x80\x80\x80\x80",
	"\xDF", "\xEF\xBF", "\xF7\xBF\xBF", "\xFB\xBF\xBF\xBF", "\xFD\xBF\xBF\xBF\xBF",
	# impossible bytes
	"\xFE", "\xFF", "\xFE\xFE\xFF\xFF",
	# over-long '/' character
	# (0x2F encoded in two, three, four, five and six bytes)
	"\xC0\xAF",
	"\xE0\x80\xAF",
	"\xF0\x80\x80\xAF",
	"\xF8\x80\x80\x80\xAF",
	"\xFC\x80\x80\x80\x80\xAF",
	# maximum overlong sequences
	# (the largest value that is still over-long at each length)
	"\xc1\xbf", # U+007F in two bytes
	"\xe0\x9f\xbf", # U+07FF in three bytes
	"\xf0\x8f\xbf\xbf", # U+FFFF in four bytes
	"\xf8\x87\xbf\xbf\xbf", # 0x1FFFFF in five bytes
	"\xfc\x83\xbf\xbf\xbf\xbf", # 0x3FFFFFF in six bytes
	# overlong NUL
	# (0x00 encoded in two, three, four, five and six bytes)
	"\xc0\x80", # NOTE(review): same bytes as the "\xC0\x80" entry near the top of this list
	"\xe0\x80\x80",
	"\xf0\x80\x80\x80",
	"\xf8\x80\x80\x80\x80",
	"\xfc\x80\x80\x80\x80\x80",
	]