Skip to content

Instantly share code, notes, and snippets.

@zshamrock
Last active January 15, 2016 10:17
Show Gist options
  • Save zshamrock/7bb49d5f10d93a2b2580 to your computer and use it in GitHub Desktop.
Save zshamrock/7bb49d5f10d93a2b2580 to your computer and use it in GitHub Desktop.
General:
http://www.joelonsoftware.com/articles/Unicode.html
Unicode:
http://www.unicode.org/standard/WhatIsUnicode.html
http://www.unicode.org/history/unicode88.pdf
http://unicode.org/charts/
http://unicode.org/cldr/utility/character.jsp
https://en.wikipedia.org/wiki/List_of_Unicode_characters
UTF:
http://www.utf-8.com/
http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt
http://www.ietf.org/rfc/rfc3629.txt
http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
http://unicode.org/faq/utf_bom.html
Java:
http://docs.oracle.com/javase/7/docs/technotes/guides/intl/overview.html
http://www.oracle.com/technetwork/articles/java/supplementary-142654.html
Go:
https://blog.golang.org/strings
https://github.com/paulrosania/go-charset
Normalization:
https://blog.golang.org/normalization
https://docs.oracle.com/javase/8/docs/api/java/text/Normalizer.html
http://en.wikipedia.org/wiki/Unicode_equivalence
http://unicode.org/reports/tr15/
package main
// https://play.golang.org/p/XydB2AQ1dV
import (
"fmt"
"unicode/utf8"
"unicode/utf16"
"bytes"
)
// main walks through the two Unicode encodings of the supplementary code
// point U+1030C: first as a UTF-16 surrogate pair, then as UTF-8 bytes.
// The trailing "// =>" comments record the expected console output.
func main() {
	const codePoint rune = 0x1030c

	// A code point above U+FFFF does not fit in a single UTF-16 unit, so it
	// is split into a high and a low surrogate.
	high, low := utf16.EncodeRune(codePoint)
	fmt.Printf(
		"%U %U %v %v %v\n",
		high, low,
		utf16.IsSurrogate(codePoint), utf16.IsSurrogate(high), utf16.IsSurrogate(low),
	) // => U+D800 U+DF0C false true true

	// Write each surrogate half to the buffer as if it were a rune of its
	// own; the expected output shows replacement characters instead of text.
	var out bytes.Buffer
	for _, half := range []rune{high, low} {
		out.WriteRune(half)
	}
	fmt.Printf("U+1030C is %s %X %X\n", out.String(), high, low) // => U+1030C is �� D800 DF0C

	// Encode the same code point as UTF-8 and show the raw bytes.
	encoded := make([]byte, utf8.UTFMax)
	utf8.EncodeRune(encoded, codePoint)
	out.Reset()
	out.Write(encoded)
	fmt.Printf("U+1030C is %s, raw bytes %X %v\n", out.String(), encoded, encoded) // => U+1030C is 𐌌, raw bytes F0908C8C [240 144 140 140]
}
package experiments;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
/**
 * Console walkthrough of how Java strings relate to Unicode: code points,
 * decoding the same bytes with different charsets, byte order marks,
 * surrogate pairs, and normalization forms.
 *
 * The trailing "// =>" comments record the expected console output.
 */
public class Unicode {

    /** Runs every demo section in a fixed order. */
    public static void main(String[] args) {
        showCodePoints();
        showCharsetDecoding();
        showEscapedLiterals();
        showSurrogates();
        showNormalization();
    }

    /** Prints the code point found at each char index of a two-character CJK string. */
    private static void showCodePoints() {
        String world = "世界";
        for (int index = 0; index < world.length(); index++) {
            System.out.printf("%X%n", world.codePointAt(index));
        }
        // => 4E16
        // => 754C
    }

    /** Decodes fixed byte sequences with several charsets and prints each result with its length. */
    private static void showCharsetDecoding() {
        // "Hello, " in single bytes followed by two three-byte UTF-8 sequences.
        byte[] utf8Bytes = {
            0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20,
            (byte) 0xe4, (byte) 0xb8, (byte) 0x96, (byte) 0xe7, (byte) 0x95, (byte) 0x8c
        };
        System.out.printf("Byte array length: %d\n\n", utf8Bytes.length); // => Byte array length: 13

        print("UTF-8", new String(utf8Bytes, StandardCharsets.UTF_8)); // => UTF-8 : Hello, 世界 Length 9
        print("ASCII", new String(utf8Bytes, StandardCharsets.US_ASCII)); // => ASCII : Hello, ������ Length 13
        print("UTF-16", new String(utf8Bytes, StandardCharsets.UTF_16)); // => UTF-16 : 䡥汬漬⃤뢖� Length 7

        // The same text as two-byte big-endian units, without a byte order mark.
        byte[] utf16NoBom = {
            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
            0x4e, 0x16, 0x75, 0x4c
        };
        print("UTF-16", new String(utf16NoBom, StandardCharsets.UTF_16)); // => UTF-16 : Hello, 世界 Length 9

        // Big-endian units preceded by the FE FF byte order mark.
        byte[] utf16BigEndianBom = {
            (byte) 0xfe, (byte) 0xff,
            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
            0x4e, 0x16, 0x75, 0x4c
        };
        print("UTF-16 BE1", new String(utf16BigEndianBom, StandardCharsets.UTF_16)); // => UTF-16 BE1: Hello, 世界 Length 9
        print("UTF-16 BE2", new String(utf16BigEndianBom, StandardCharsets.UTF_16BE)); // => UTF-16 BE2: Hello, 世界 Length 10
        print("UTF-16 LE1", new String(utf16BigEndianBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE1: �䠀攀氀氀漀Ⰰ ᙎ䱵 Length 10

        // Little-endian units preceded by the FF FE byte order mark.
        byte[] utf16LittleEndianBom = {
            (byte) 0xff, (byte) 0xfe,
            0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00,
            0x16, 0x4e, 0x4c, 0x75
        };
        print("UTF-16 LE2", new String(utf16LittleEndianBom, StandardCharsets.UTF_16)); // => UTF-16 LE2: Hello, 世界 Length 9
        print("UTF-16 LE3", new String(utf16LittleEndianBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE3: Hello, 世界 Length 10
    }

    /** Prints the same text once from escaped char literals and once from a plain literal. */
    private static void showEscapedLiterals() {
        // every char is 2 bytes (16 bit)
        String escapedHello = "\u0048\u0065\u006c\u006c\u006f\u002c\u0020\u4e16\u754c";
        System.out.printf("Hello, world is %s\n", escapedHello); // => Hello, world is Hello, 世界
        String plainWorld = "世界";
        System.out.printf("World is %s\n", plainWorld); // => World is 世界
    }

    /** Prints supplementary characters built from surrogate pairs, then the pair values for 0x10302. */
    private static void showSurrogates() {
        byte[] surrogatePairBytes = {(byte) 0xd8, 0x00, (byte) 0xdf, 0x0c};
        String decodedPair = new String(surrogatePairBytes, StandardCharsets.UTF_16);
        System.out.println(decodedPair);
        System.out.println("\ud800\udf02");

        int supplementary = 0x10302;
        char high = Character.highSurrogate(supplementary);
        char low = Character.lowSurrogate(supplementary);
        System.out.printf("%x %x\n", (int) high, (int) low); // => d800 df02
    }

    /** Compares precomposed and decomposed strings before and after normalization. */
    private static void showNormalization() {
        String accentA = "\u00c1";
        String decomposedAccentA = "\u0041\u0301";
        boolean accentsEqual = accentA.equals(decomposedAccentA);
        System.out.printf("%s logical eq %s, but alone \u0041, and accent alone \u0301, but eq? %b\n", accentA, decomposedAccentA, accentsEqual); // => Á logical eq Á, but alone A, and accent alone ́, but eq? false

        String ffi = "\ufb03";
        String compatibleDecomposedFFI = "\u0066\u0066\u0069";
        boolean ligaturesEqual = ffi.equals(compatibleDecomposedFFI);
        System.out.printf("%s logical/semantically eq %s, but eq? %b\n", ffi, compatibleDecomposedFFI, ligaturesEqual); // => ffi logical/semantically eq ffi, but eq? false

        String[] accentForms = {accentA, decomposedAccentA};
        String[] ligatureForms = {ffi, compatibleDecomposedFFI};

        // Canonical forms on the accented letter.
        printNormalizedLengths(accentForms, Normalizer.Form.NFD, Normalizer.Form.NFC); // => 2, 1, 2, 1
        // Compatibility forms on the ligature and its three-letter spelling.
        printNormalizedLengths(ligatureForms, Normalizer.Form.NFKD, Normalizer.Form.NFKC); // => 3, 3, 3, 3
        // Canonical forms on the same two strings.
        printNormalizedLengths(ligatureForms, Normalizer.Form.NFD, Normalizer.Form.NFC); // => 1, 1, 3, 3

        String recomposedAccentA = Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFC);
        System.out.println(accentA.equals(recomposedAccentA)); // => true

        String ligatureNfkc = Normalizer.normalize(ffi, Normalizer.Form.NFKC);
        String lettersNfkc = Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKC);
        System.out.println(ligatureNfkc.equals(lettersNfkc)); // => true

        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC)); // => ffi
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKD)); // => ffi
    }

    /**
     * Prints, one per line, the length of every text under every form,
     * iterating texts in the outer loop and forms in the inner loop.
     */
    private static void printNormalizedLengths(String[] texts, Normalizer.Form... forms) {
        for (String text : texts) {
            for (Normalizer.Form form : forms) {
                System.out.println(Normalizer.normalize(text, form).length());
            }
        }
    }

    /**
     * Prints one aligned row: the charset label, the decoded text, and the
     * text's length as reported by {@link String#length()}.
     *
     * @param charset label shown in the first column
     * @param text    decoded string to display
     * @return the stream that was written to
     */
    private static PrintStream print(String charset, String text) {
        int charCount = text.length();
        return System.out.format("%-10s: %-12s Length %d\n", charset, text, charCount);
    }
}
package main
import (
"fmt"
"unicode/utf8"
)
// main contrasts the byte length of a string with its rune count and then
// dumps the underlying bytes in hex.
func main() {
	text := "世界"

	byteCount := len(text)
	fmt.Println(byteCount) // => 6

	runeCount := utf8.RuneCountInString(text)
	fmt.Println(runeCount) // => 2

	// The space flag in "% X" separates the hex bytes.
	fmt.Printf("% X\n", text) // => E4 B8 96 E7 95 8C
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment