Created
November 22, 2018 05:49
-
-
Save bogdanRada/77a7eda45e41ccafa5be48199d662a36 to your computer and use it in GitHub Desktop.
Understanding encoding through examples in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8

# Demonstrates how the same text is represented in UTF-8, ISO-8859-1
# (Latin-1) and ISO-8859-15 (Latin-9) by printing the raw bytes and the
# values yielded by String#codepoints for each encoding.

# Formats a list of integers as uppercase hex, each followed by a space,
# then appends "= description".
#
# @param values [Array<Integer>] byte values or codepoints to display
# @param description [String] text shown after the "= " separator
# @return [String] a single output line (without trailing newline)
def hex_line(values, description)
  "#{values.map { |value| format('%X ', value) }.join}= #{description}"
end

# Builds the "<text> encoded as <ENCODING>" label for a string.
#
# The text is transcoded to UTF-8 before interpolation so that strings in
# different encodings can be safely combined into one output line.
#
# @param text [String] the string being described, in any encoding
# @param note [String, nil] optional remark appended in parentheses
# @return [String] the label in UTF-8
def encoding_label(text, note = nil)
  label = "#{text.encode('UTF-8')} encoded as #{text.encoding.name}"
  note ? "#{label} (#{note})" : label
end

# @param text [String] the string whose raw bytes are listed
# @param note [String, nil] optional remark appended in parentheses
# @return [String] hex dump of every byte of +text+, plus its label
def byte_line(text, note = nil)
  hex_line(text.bytes, encoding_label(text, note))
end

# @param text [String] the string whose codepoints are listed
# @param note [String, nil] optional remark appended in parentheses
# @return [String] hex dump of every value from String#codepoints, plus label
def codepoint_line(text, note = nil)
  hex_line(text.codepoints, encoding_label(text, note))
end

utf8_resume = "Résumé"
latin1_resume = utf8_resume.encode("ISO-8859-1")
latin9_resume = utf8_resume.encode("ISO-8859-15")

# Strings built from \u escapes are already UTF-8; no transcoding is needed.
lower_spanish = "\u00E1 \u00E9 \u00ED \u00F3 \u00FA \u00F1"
puts lower_spanish
upper_spanish = "\u00C1 \u00C9 \u00CD \u00D3 \u00DA \u00D1"
puts upper_spanish

resumes = [utf8_resume, latin1_resume, latin9_resume]

puts 'Display each byte (hex)'
resumes.each { |resume| puts byte_line(resume) }
puts

puts 'Display each character as a Unicode codepoint (hex)'
resumes.each { |resume| puts codepoint_line(resume) }
puts

utf8_money = "\u{20AC A4 A3 A5}"
latin1_money = "\u{A4 A3 A5}".encode("ISO-8859-1")
latin9_money = "\u{20AC A3 A5}".encode("ISO-8859-15")

# Each currency string is paired with the remark shown next to it.
money_samples = [
  [utf8_money, nil],
  [latin1_money, '20AC is invalid'],
  [latin9_money, 'A4 is invalid, 20AC gets converted into A4']
]

puts 'Display each byte (hex)'
money_samples.each { |money, note| puts byte_line(money, note) }
puts

# For the ISO-8859-15 string the listed value for the euro sign is A4 (the
# encoding's own code), not 20AC, which is what the remark above points out.
puts 'Display each character as a Unicode codepoint (hex)'
money_samples.each { |money, note| puts codepoint_line(money, note) }
puts

puts <<EOT
NOTE:
\u00A4 (called currency symbol) in Latin-1 changed to \u20AC in Latin-9
http://en.wikipedia.org/wiki/ISO/IEC_8859-15
EOT
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment