Jookia · January 29, 2020 22:46
diff --git a/shiftjis.py b/shiftjis.py
 # Originally written on 2017-04-26 19:03.

 # Welcome to my fancy Shift JIS explanation and simple implementation in Python.
 #
 # If you haven't heard of Shift JIS, it's a single or double-byte encoding for
 # the JIS X 0201 and JIS X 0208 character sets, a bit like UTF-8 is for Unicode.
 #
 # The key feature of Shift JIS is being able to use both character sets at once,
 # retaining legacy half-width ASCII and Katakana characters from JIS X 0201.
 #
 # First we're going to implement the JIS X 0201 standard. ASCII[1] only uses 7
 # bits, meaning only 128 characters are defined. JIS X 0201 uses 8 bits like
 # most ASCII variants, meaning it has space for 256 instead.[2] ASCII variants
 # are generally split to two halves: The 'lower' half and 'upper' half.
 #
 # For JIS X 0201, the lower half is a modified ASCII layout. It contains control
 # characters which are used to change the text layout and make your terminal go
 # beep and printable characters which you're reading right now.
 #
 # The upper half (which gets its name by having its first of eight bits set)
 # contains some symbols and Katakana, 62 in total of the available 127 spaces.
 # JIS X 0211 adds more control characters[3] but we won't be implementing that.
 #
 # [1]: https://en.wikipedia.org/wiki/ASCII
 # [2]: https://en.wikipedia.org/wiki/JIS_X_0201
 # [3]: https://en.wikipedia.org/wiki/JIS_X_0211

 # Code time! For simplicity's sake (it'll become important later), characters
 # will be specified in hexadecimal. This is useful since you can line them up
 # with the Wikipedia graphs but also instantly tell if a number fits in to a
 # single byte since a single byte is two hexadecimal digits long. I'll also do
 # my best to make any range checks look like a number line. Error checking is
 # important, which is why this code throws a lot of exceptions if something
 # exceptional happens, as is the Python way.
 #
 # In addition, most the functions (if not all) are going to be tested using
 # exhaustive test functions. The idea is to knowingly generate some invalid
 # inputs and some valid inputs then compare to see if our functions can tell the
 # difference properly. In a lot of cases this will be done just by quickly
 # looping through all possible characters in a search space, as well as invalid
 # ones to verify that those are picked up too. When the tests fail, they'll just
 # print out some variables (see the source code to decipher them.)
 #
 # Overall, this source file is going to be divided on a per-codec basis, so
 # let's get started with JIS X 0201.

 # You'll notice there's no jisx0201_encode or decode functions. Since JIS X 0201
 # can fit in a byte, there's no need for any. It maps to itself nicely.

 def jisx0201_valid(char):
  ascii  = (0x00 <= char and char <= 0x7F)
  custom = (0xA1 <= char and char <= 0xDF)
  return (ascii or custom)

 def test_jisx0201_valid():
  char = -0x100
  while char <= 0x100:
    valid = jisx0201_valid(char)
    try:
      # This encoding should always be correct, so decode errors are just about
      # invalid characters and not invalid encodings.
      enc = bytes([char])
      enc.decode("shiftjis")
      realValid = True
    except ValueError as e:
      # Converting to either bytes or Unicode failed, either way it's invalid.
      realValid = False
    if valid != realValid:
      print("%s %x %s" % (valid, char, realValid))
      return False
    char += 0x01
  return True
 print("test_jisx0201_valid: %s" % (test_jisx0201_valid()))

 # JIS X 0208 requires us to handle double bytes, so let's set up some functions
 # to be able to convert numbers to and from tuples of bytes like (b1, b2).

 def byte_valid(byte):
  return (0x00 <= byte and byte <= 0xFF)

 def test_byte_valid():
  b = -0x100
  while b <= 0x100:
    valid = byte_valid(b)
    if (b < 0x00 or 0xFF < b) and valid:
      print("%x" % (b))
      return False
    b += 0x01
  return True
 print("test_byte_valid: %s" % (test_byte_valid()))

 # Now we need some functions to pack and unpack two bytes. In hexadecimal, a
 # byte is two digits, so this code is equivalent to the decimal version of
 # multiplying a two digits by 100 then adding the second number below back.

 def db_pack(b1, b2):
  if byte_valid(b1) and byte_valid(b2):
    return ((b1 * 0x100) + b2)
  else:
    raise ValueError("db_pack: Invalid byte(s) %x %x" % (b1, b2))

 def db_unpack(bytes):
  if 0xFFFF < bytes:
    raise ValueError("db_unpack: 'bytes' larger than two bytes" % (bytes))
  
  # To unpack bytes divide by 100 to get the first digits then modulo to get the
  # remainder for the last two digits.
  return (bytes // 0x100, bytes % 0x100)

 def test_dbpacking():
  b1 = -0x10
  while b1 < 0x110:
    b2 = -0x10
    while b2 <= 0x110:
      valid = (byte_valid(b1) and byte_valid(b2))
      try:
        packed   = db_pack(b1, b2)
        (b3, b4) = db_unpack(packed)
        
        if b1 != b3 or b2 != b4:
          print("%x-%x %x %x-%x" % (b1, b2, packed, b3, b4))
          return False
        realValid = True
      except ValueError as e:
        realValid = False
      
      if valid != realValid:
        print("%x-%x" % (b1, b2))
      b2 +=1
    b1 += 1
  return True
 print("test_dbpacking: %s" % (test_dbpacking()))

 # So far it's pretty simple. Next up we need to look at implementing some
 # functions for JIS X 0208.[4] Unlike Unicode which is one-dimensional with
 # sections marked as planes, JIS X 0208 is divided in to a 94x94 grid, with each
 # character being put in a particular row in a particular cell.
 #
 # Why 94? The reason is ISO/IECC 2022.[5] Instead of using 8 bits like JIS X
 # 0201, JIS X 0208 and some other standards define their character characters by
 # fitting their rows and cells in to two bytes compatible with 7-bit printable
 # ASCII. Control characters can then be used to switch between different
 # encodings in the middle of text. Because of this, a JIS X 0208 character is
 # actually two bytes stuck together in the printable ASCII range (0x20 - 0x7E,
 # 0x7F is 'delete' for some reason) rather than being a continuous set like
 # Unicode. This means that the character '0x2121' is actually row 1, cell 1 and
 # the character '0x7E7E' is row 94, 94.
 #
 # It also means that if we were to lay out the characters continuously for every
 # number like Unicode does, we'd find that each row (say, 0x21XX) is actually a
 # byte long, containing 256 with only 94 used, ignoring 162. This tradeoff is
 # made so that you can take any JIS X 0208 character and display it as mangled
 # ASCII text rather than random control characters that make your text mess up.
 # For example, row 3, cell 16 is '０' (converted to Unicode) or '#0' in mangled
 # ASCII which can then be converted back to JIS X 0208.
 #
 # There's a table of JIS X 0208 characters online[6] which shows all the rows
 # and cells. Don't be fooled, the cells don't have their own rows and columns
 # (confusingly the Wikipedia page seems to imply this), that's just for display.
 # In it you can see the JIS character code and the Shift JIS encoded character,
 # and a few interesting issues: The entire character set is wide, not narrow
 # like JIS X 0201, and it contains unallocated blocks. The first issue is why
 # people switch between JIS X 0201 and JIS X 0208, and the second issue is why a
 # lot of systems still use Shift JIS: They use these blocks for themselves.
 #
 # [4]: https://en.wikipedia.org/wiki/JIS_X_0208
 # [5]: https://en.wikipedia.org/wiki/ISO/IEC_2022
 # [6]: http://www.asahi-net.or.jp/~AX2S-KMTN/ref/jisx0208.html

 # For understanding's sake, we're going to deal with JIS X 0208 by specifying
 # rows and cells rather than encoded characters in hexadecimal.

 def jisx0208_valid(row, cell):
  valid_row  = (1 <= row  and row  <= 94)
  valid_cell = (1 <= cell and cell <= 94)
  return (valid_row and valid_cell)

 def jisx0208_encode(row, cell):
  if jisx0208_valid(row, cell):
    # Add 0x20 to align to ASCII printable characters, then pack it.
    return db_pack(row + 0x20, cell + 0x20)
  else:
    raise ValueError("jisx0208_encode: invalid character %i,%i" % (row, cell))

 def jisx0208_decode(character):
  (row, cell) = db_unpack(character)
  return (row - 0x20, cell - 0x20)

 def test_jisx0208_codec():
  row = -10
  while row <= 100:
    cell = -10
    while cell <= -100:
      valid = ((1 <= row and row <= 94) and (1 <= cell and cell <= 94))
      try:
        character = jisx0208_encode(row, cell)
        (row2, cell2) = jisx0208_decode(character)
        
        if row != row2 or cell != cell2:
          print("%i,%i %x %i,%i" % (row, cell, character, row2, cell2))
          return False
        realValid = True
      except ValueError as e:
        realValid = False
      
      if valid != realValid:
        print("%i-%i" % (row, cell))
      cell +=1
    row += 1
  return True
 print("test_jisx0208_codec: %s" % (test_jisx0208_codec()))

 # Ok, now that we can encode both character sets it's time to encode Shift JIS!
 # First, let's study the map of Shift JIS.[7] Take a moment to drink it in.
 #
 # We can see that Shift JIS is a superset of JIS X 0201, rather than 7-bit
 # ASCII. The first byte can either be JIS X 0201 or the start of a shifted JIS X
 # 0208 character. It gets its name from 'shifting' the first byte of its
 # encoding around the upper half of JIS X 0201 in the unallocated 65 characters.
 #
 # When encoding, the first byte only uses 47 of the 65 characters that could be
 # used for encoding, and the second byte maps 188 characters for use. If you add
 # them together, 47x188 and 94x94 are both 8836, so all together there's enough
 # room to store a JIS X 0208 character.
 #
 # The key to Shift JIS is how it redistributes the bits of a JIS X 0208
 # character: You can't fit the row number in to one of the 65 characters, so
 # part of the number needs to be stored in the second byte. It does this by
 # moving whether the row is odd or even from a bit in the row number to a
 # position-based indication in the second byte. This halves the first byte's
 # needed characters to 47 and doubles the second byte's to 188.
 #
 # It's worth noting that unlike ASCII or UTF-8, there's no bit packing or
 # masking that allow you to extract the character directly from the bytes.
 # Instead, there's some weird offsets that actually affect the code: The first
 # byte starts being encoded at 0x81 instead of 0x80. All the other indices start
 # at 0, but this one just starts at 1 for some reason. I don't know why.
 #
 # The second byte is a lot stranger: The section for odd rows starts at 0x40,
 # meaning the code now needs to skip the 0x7F 'DEL' control code. It also
 # overlaps with JIS X 0211 control codes, meaning it could actually mess up your
 # terminal unlike a 7-bit encoding. Moving that section back to 0x20 would solve
 # both issues. Additionally, the section for even rows starts straight after at
 # 0x9F. Moving that forward to 0xA0 would mean you could check determine if the
 # row is odd by whether the 8th bit is set instead of checking a range.
 #
 # All of this hassle gets us an encoding that supports multiple character sets
 # without the stateful control codes used in ISO/IEC 2022 and backwards
 # compatibility with JIS X 0201 text. As far as I can tell, a much nicer
 # encoding (EUC-JP)[8] was available since 1993 but instead Shift JIS was
 # standardized by Microsoft in 1997 just to have that backwards compatibility.
 # Interestingly enough, Microsoft uses a non-standard extension of Shift JIS.[9]
 #
 # [7]: https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map
 # [8]: https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP
 # [9]: https://en.wikipedia.org/wiki/Code_page_943

 def shiftjis_encode_jisx0208(character):
  (row, cell) = jisx0208_decode(character)
  
  if not jisx0208_valid(row, cell):
    raise ValueError("shiftjis_encode: invalid character %i,%i" % (row, cell))
  
  # For the first byte...
  b1 = row + 1 # Add 1 so the encoding starts at 0x81.
  b1 = b1 // 2 # Shrink the value to fit, losing the odd/even bit.
  
  # Shift the byte based on whether it can fit behind the JIS X 0201 symbols.
  if b1 < 32:
    b1 = b1 + 0x80
  else:
    b1 = b1 + 0xE0
    b1 = b1 - 32 # Bytes ahead of the symbols are implied to start at 32.
  
  # For the second byte...
  row_odd = (row % 2 == 1) # Check if the row is odd.
  offset = 0x40 if row_odd else 0x9F # Pick the indicative offset.
  b2 = offset + cell - 1 # Shift to the appropriate location, cells start at 1.
  
  # For odd rows, shift past the 'DEL' control character at 0x7F.
  if row_odd and 0x7F <= b2:
    b2 += 1
  
  return db_pack(b1, b2)

 def shiftjis_decode_jisx0208(db):
  (b1, b2) = db_unpack(db)
  
  # For the first byte...
  if 0x81 <= b1 and b1 <= 0x9F:
    row = b1 - 0x80 # Shift its value from behind JIS X 0201 symbols.
  elif 0xE0 <= b1 and b1 <= 0xFF:
    row = b1 - 0xE0 # Shift its value from ahead of the symbols.
    row = row + 32 # Add 32 to compensate for values ahead starting from 0.
  else:
    raise ValueError("shiftjis_decode_jisx0208: invalid first byte %x" % (b1))
  
  # Expand the row back to an even number. While dividing by 2 would floor the
  # value meaning the odd number corresponding to this cell would be next,
  # adding 1 so the encoding starts at 0x81 means it actually gets round up to
  # the ceiling, making the odd number below this number.
  row = row * 2
  
  # For the second byte...
  if 0x40 <= b2 and b2 <= 0x9E: # Odd row!
    row = row - 1 # Set to corresponding odd number.
    cell = b2 - 0x40
    
    # Compensate for shifting past the 'DEL' control character.
    if 0x7F < b2:
      cell = cell - 1
    elif b2 == 0x7F:
      raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))
  elif 0x9F <= b2 and b2 <= 0xFC:
    cell = b2 - 0x9F
  else:
    raise ValueError("shiftjis_decode_jisx0208: invalid second byte %x" % (b2))
  
  # Shift forward since cells start at 1.
  cell = cell + 1
  
  return jisx0208_encode(row, cell)

 # Make sure the decoder doesn't accept rubbish input.
 def test_shiftjis_jisx0208_decoder():
  b1 = -0x10
  while b1 <= 100:
    b2 = -0x10
    while b2 <= 100:
      bs_valid = (byte_valid(b1) and byte_valid(b2))
      b1_valid = ((0x81 <= b1 and b1 <= 0x9F) or (0xE0 <= b1 and b1 <= 0xFF))
      b2_valid = ((0x40 <= b2 and b1 <= 0x9E) or (0x9F <= b1 and b1 <= 0xFC))
      b2_not_del = (b2 != 0x7F)
      valid = (bs_valid and b1_valid and b2_valid and b2_not_del)
      
      try:
        enc = db_pack(b1, b2)
        decoded = shiftjis_decode_jisx0208(enc)
        realValid = True
        
        if valid != realValid:
          print("%x-%x %x %x %s" % (b1, b2, enc, decoded, realValid))
          return False
      except ValueError as e:
        realValid = False
      
      if valid != realValid:
        print("%x-%x %s" % (b1, b2, realValid))
        return False
      b2 +=1
    b1 += 1
  return True
 print("test_shiftjis_jisx0208_decoder: %s" % (test_shiftjis_jisx0208_decoder()))

 # Make sure the encoder and decoder don't mangle the data.
 def test_shiftjis_jisx0208_encoder():
  row = 1
  while row <= 94:
    cell = 1
    while cell <= 94:
      valid = True
      try:
        character = jisx0208_encode(row, cell)
        enc = shiftjis_encode_jisx0208(character)
        character2 = shiftjis_decode_jisx0208(enc)
        (row2, cell2) = jisx0208_decode(character2)
        
        if row != row2 or cell != cell2 or character != character2:
          print("%i,%i %x %x %x %i,%i" %
            (row, cell, character, enc, character2, row2, cell2))
          return False
        realValid = True
      except ValueError as e:
        realValid = False
      
      if valid != realValid:
        print("%i,%i %s" % (row, cell, realValid))
        return False
      cell +=1
    row += 1
  return True
 print("test_shiftjis_jisx0208_encoder: %s" % (test_shiftjis_jisx0208_encoder()))

 # Okay, so that's the hard part done. We now have the ability to encode and
 # decode text in the two ways Shift JIS does: JIS X 0201 and mangled JIS X 0208.
 # Now all we need is to write the Shift JIS bytes encoder! The idea is that the
 # accepted input will be encoded characters from either JIS X 0201 or JIS X 0208
 # and output will be bytes, and vice-versa for decoding. These are just simple
 # wrappers and don't need much error checking since the other functions do that.

 def shiftjis_encode(characters):
  encoded = bytearray()
  
  for character in characters:
    if jisx0201_valid(character):
      encoded.append(character)
    else:
      (row, cell) = jisx0208_decode(character)
      
      if jisx0208_valid(row, cell):
        (b1, b2) = db_unpack(shiftjis_encode_jisx0208(character))
        encoded.append(b1)
        encoded.append(b2)
      else:
        raise ValueError("shiftjis_encode: invalid character %x" % (character))
  
  return encoded

 def shiftjis_decode(bytes):
  characters = []
  head = None
  
  for byte in bytes:
    if head == None:
      if jisx0201_valid(byte):
        characters.append(byte)
      else:
        head = byte
    else:
      encoded = db_pack(head, byte)
      decoded = shiftjis_decode_jisx0208(encoded)
      characters.append(decoded)
      head = None
  
  if head != None: # Trailing first byte?
    raise ValueError("shiftjis_decode: trailing first byte %x" % (head))
  
  return characters

 # Testing the mix of JIS X 0201 and JIS X 0208 characters is a bit hard without
 # fuzz testing, and the tests I've been doing so far have just been exhaustive.
 # I don't want to write a random testing framework right now, so let's just use
 # traditional unit tests with some examples that should shake out bad code.

 def test_shiftjis_codec():
  valid_tests = [
    # Just ASCII.
    [ord('H'), ord('e'), ord('l'), ord('l'), ord('o'), 0x00],
    # ASCII with a JIS X 0208 character.
    [ord('H'), ord('e'), ord('y'), jisx0208_encode(0x1, 0xA), 0x00],
    # JIS X 0201 characters with a JIS X 0208 character.
    [0xA2, 0xA3, jisx0208_encode(0x1, 0xA), 0x00]
  ]
  
  for test in valid_tests:
    try:
      enc = shiftjis_encode(test)
      dec = shiftjis_decode(enc)
    except ValueError as e:
      print("%s" % (test))
      return False
  
  # Have to have invalid tests made up of bytes to avoid all the safety checks
  # in the functions we've created.
  
  invalid_tests = [
    # Invalid second byte.
    bytearray(b'hello!\x81\x10\x00'),
    # Unused first bytes.
    bytearray(b'hello!\x80\x10\x21'),
    bytearray(b'hello!\x80\xFE\x21'),
    # Unused second bytes.
    bytearray(b'hello!\x80\x81\x30'),
    bytearray(b'hello!\x80\x81\xFF'),
    # Trailing first byte.
    bytearray(b'hello!\x91')
  ]
  
  for test in invalid_tests:
    try:
      dec = shiftjis_decode(test)
      print("%s" % (test))
      return False
    except ValueError as e:
      pass
  return True
 print("test_shiftjis_codec: %s" % (test_shiftjis_codec()))

 # Whew! Everything should be fine, and the encoder should work. Of course, it's
 # still possible that it's outputting garbage that it can somehow encode and
 # decode losslessly. (I really doubt that, though.) But just in case there's
 # more development to do, we're going to do a final test by using Python's built
 # in encoder/decoder to do a round trip for comparison.
 #
 # A small snag is that Python will fail characters that can't be converted to
 # Unicode, even if they can be encoded in Shift JIS (this includes private use
 # characters like emoji). The solution to this is for our tests to check if the
 # character in a row or cell is allocated in the standard, since the standard
 # has since been incorporated to Unicode.

 def jisx0208_allocated(row, cell):
  if not jisx0208_valid(row, cell):
    return False
  
  if (8 < row and row < 16) or (84 < row): # Unassigned rows.
    return False
  
  # These are all transcribed based on the JIS 0208 chart listed earlier.
  unallocated = {
    '2': [(15, 25), (34, 41), (49, 59), (75, 81), (90, 93)],
    '3': [(1, 15), (26, 32), (59, 64), (91, 94)],
    '4': [(84, 94)],
    '5': [(87, 94)],
    '6': [(25, 32), (57, 94)],
    '7': [(34, 48), (82, 94)],
    '8': [(33, 94)],
    '47': [(52, 94)],
    '84': [(7, 95)]}
  
  if unallocated.get(str(row)):
    for i in unallocated.get(str(row)):
      if i[0] <= cell and cell <= i[1]:
        return False
  
  return True

 def test_shiftjis_external():
  # Loop through all the rows and make sure that we can do a round trip of
  # encoding and decoding of valid characters.
  
  row = 1
  while row <= 94:
    points = []
    
    cell = 0
    while cell <= 94:
      if jisx0208_allocated(row, cell):
        encoded = jisx0208_encode(row, cell)
        points.append(encoded)
      cell += 1
    
    # Mix in some JIS X 0201 characters!
    points.append(ord('H'))
    points.append(ord('e'))
    points.append(ord('y'))
    points.append(0xA2)
    points.append(0xA3)
    
    try:
      enc = shiftjis_encode(points)
      enc2 = enc.decode("shift_jis").encode("shift_jis")
      
      if enc != enc2:
        print("%s %s %s %s" % (points, enc, dec, dec2))
        return False
    except Exception as e:
      print("%s" % (points))
    
    row += 1
  return True
 print("test_shiftjis_external: %s" % (test_shiftjis_external()))

 # Well, that's all. I hope you learned something today. If you want to mess with
 # the code some more, I suggest adding support for Microsoft's Code Page 943
 # variant and other non-standard variants. I think some of them just add more
 # rows so it only works in Shift JIS and not ISO/IECC 2022, or use unallocated
 # rows. There's also JIS X 0213 which expands the character set in to two
 # planes, but I haven't found much documentation on how this works since I can't
 # read Japanese. I hope if I've gotten any point to you across, it's that Shift
 # JIS isn't that scary once you stop using formulas and decimal numbers. The
 # only bizarre moments come from the choice of offsets. See you later!

 # The author of this file has dedicated its contents to the public domain
 # using the CC0 Public Domain Dedication 1.0. For full legal information see
 # <https://creativecommons.org/publicdomain/zero/1.0/>.