zshamrock · January 15, 2016 10:17
diff --git a/References b/References
 General:
 http://www.joelonsoftware.com/articles/Unicode.html

 Unicode:
 http://www.unicode.org/standard/WhatIsUnicode.html
 http://www.unicode.org/history/unicode88.pdf
 http://unicode.org/charts/
 http://unicode.org/cldr/utility/character.jsp
 https://en.wikipedia.org/wiki/List_of_Unicode_characters

 UTF:
 http://www.utf-8.com/
 http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt
 http://www.ietf.org/rfc/rfc3629.txt
 http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
 http://unicode.org/faq/utf_bom.html

 Java:
 http://docs.oracle.com/javase/7/docs/technotes/guides/intl/overview.html
 http://www.oracle.com/technetwork/articles/java/supplementary-142654.html

 Go:
 https://blog.golang.org/strings
 https://github.com/paulrosania/go-charset

 Normalization:
 https://blog.golang.org/normalization
 https://docs.oracle.com/javase/8/docs/api/java/text/Normalizer.html
 http://en.wikipedia.org/wiki/Unicode_equivalence
 http://unicode.org/reports/tr15/
diff --git a/surrogates.go b/surrogates.go
 package main

 // https://play.golang.org/p/XydB2AQ1dV

 import (
 	"fmt"
 	"unicode/utf8"
 	"unicode/utf16"
 	"bytes"
 )

 // main shows two encodings of the supplementary code point U+1030C: its
 // UTF-16 surrogate pair and its four-byte UTF-8 sequence. The "// =>"
 // comments record the output expected from each print statement.
 func main() {
 	const codePoint rune = 0x1030c

 	// Split the code point into its UTF-16 high and low surrogate halves.
 	hi, lo := utf16.EncodeRune(codePoint)
 	fmt.Printf(
 		"%U %U %v %v %v\n",
 		hi, lo,
 		utf16.IsSurrogate(codePoint), utf16.IsSurrogate(hi), utf16.IsSurrogate(lo),
 	) // => U+D800 U+DF0C false true true

 	// Writing each surrogate half as a rune of its own does not reproduce the
 	// character: per the expected output, each half comes out as U+FFFD.
 	var out bytes.Buffer
 	out.WriteRune(hi)
 	out.WriteRune(lo)
 	fmt.Printf("U+1030C is %s %X %X\n", out.String(), hi, lo) // => U+1030C is �� D800 DF0C

 	// Encoding the whole code point as UTF-8 fills all four bytes.
 	encoded := make([]byte, 4)
 	utf8.EncodeRune(encoded, codePoint)
 	out.Reset()
 	out.Write(encoded)
 	fmt.Printf("U+1030C is %s, raw bytes %X %v\n", out.String(), encoded, encoded) // => U+1030C is 𐌌, raw bytes F0908C8C [240 144 140 140]
 }
diff --git a/Unicode.java b/Unicode.java
 package experiments;

 import java.io.PrintStream;
 import java.nio.charset.StandardCharsets;
 import java.text.Normalizer;

 /**
  * Console walkthrough of Unicode handling in Java: code points, decoding the
  * same bytes under different charsets, UTF-16 byte order marks, surrogate
  * pairs and normalization forms.
  *
  * <p>The {@code // =>} comments record the output expected from each statement.
  */
 public class Unicode {

    /**
     * Runs every demonstration in a fixed order, printing to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        showCodePoints();
        showUtf8BytesUnderDifferentCharsets();
        showUtf16ByteOrder();
        showEscapedLiterals();
        showSurrogates();
        showNormalization();
    }

    /** Prints the code points found at char index 0 and 1 of a two-character string. */
    private static void showCodePoints() {
        final String world = "世界";
        System.out.printf("%X%n", Character.codePointAt(world, 0)); // => 4E16
        System.out.printf("%X%n", Character.codePointAt(world, 1)); // => 754C
    }

    /** Decodes one UTF-8 byte sequence as UTF-8, US-ASCII and UTF-16 to compare the results. */
    private static void showUtf8BytesUnderDifferentCharsets() {
        // "Hello, " as single bytes, followed by the three-byte UTF-8 sequences of the two CJK characters.
        final byte[] utf8Bytes = {
                0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20,
                (byte) 0xe4, (byte) 0xb8, (byte) 0x96, (byte) 0xe7, (byte) 0x95, (byte) 0x8c
        };
        System.out.printf("Byte array length: %d\n\n", utf8Bytes.length); // => Byte array length: 13

        final String asUtf8 = decode(utf8Bytes, StandardCharsets.UTF_8);
        print("UTF-8", asUtf8); // => UTF-8     : Hello, 世界    Length 9

        final String asAscii = decode(utf8Bytes, StandardCharsets.US_ASCII);
        print("ASCII", asAscii); // => ASCII     : Hello, ������ Length 13

        final String asUtf16 = decode(utf8Bytes, StandardCharsets.UTF_16);
        print("UTF-16", asUtf16); // => UTF-16    : 䡥汬漬⃤뢖�      Length 7
    }

    /** Decodes UTF-16 byte sequences without a BOM, with a big-endian BOM and with a little-endian BOM. */
    private static void showUtf16ByteOrder() {
        // No byte order mark; per the expected output, UTF_16 reads this as big-endian.
        final byte[] withoutBom = {
                0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
                0x4e, 0x16, 0x75, 0x4c
        };
        print("UTF-16", decode(withoutBom, StandardCharsets.UTF_16)); // => UTF-16    : Hello, 世界    Length 9

        // Same text preceded by the bytes FE FF.
        final byte[] bigEndianWithBom = {
                (byte) 0xfe, (byte) 0xff,
                0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
                0x4e, 0x16, 0x75, 0x4c
        };
        // UTF_16 consumes the mark (length 9); the fixed-endianness charsets keep it as a character (length 10).
        print("UTF-16 BE1", decode(bigEndianWithBom, StandardCharsets.UTF_16)); // => UTF-16 BE1: Hello, 世界    Length 9
        print("UTF-16 BE2", decode(bigEndianWithBom, StandardCharsets.UTF_16BE)); // => UTF-16 BE2: Hello, 世界   Length 10
        print("UTF-16 LE1", decode(bigEndianWithBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE1: �䠀攀氀氀漀Ⰰ ᙎ䱵   Length 10

        // Byte-swapped variant preceded by the bytes FF FE.
        final byte[] littleEndianWithBom = {
                (byte) 0xff, (byte) 0xfe,
                0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00,
                0x16, 0x4e, 0x4c, 0x75
        };
        print("UTF-16 LE2", decode(littleEndianWithBom, StandardCharsets.UTF_16)); // => UTF-16 LE2: Hello, 世界    Length 9
        print("UTF-16 LE3", decode(littleEndianWithBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE3: Hello, 世界   Length 10
    }

    /** Prints the same text written once with Unicode escapes and once with literal characters. */
    private static void showEscapedLiterals() {
        // every char is 2 bytes (16 bit)
        final String escaped = "\u0048\u0065\u006c\u006c\u006f\u002c\u0020\u4e16\u754c";
        System.out.printf("Hello, world is %s\n", escaped); // => Hello, world is Hello, 世界
        System.out.printf("World is %s\n", "世界"); // => World is 世界
    }

    /** Prints supplementary characters built from surrogate pairs and the pair values for U+10302. */
    private static void showSurrogates() {
        final byte[] pairBytes = {(byte) 0xd8, 0x00, (byte) 0xdf, 0x0c};
        System.out.println(decode(pairBytes, StandardCharsets.UTF_16));
        System.out.println("\ud800\udf02");

        final int supplementary = 0x10302;
        final char high = Character.highSurrogate(supplementary);
        final char low = Character.lowSurrogate(supplementary);
        System.out.printf("%x %x\n", (int) high, (int) low); // => d800 df02
    }

    /** Compares composed and decomposed strings before and after applying each normalization form. */
    private static void showNormalization() {
        final String composedA = "\u00c1";
        final String decomposedA = "\u0041\u0301";
        System.out.printf("%s logical eq %s, but alone \u0041, and accent alone \u0301, but eq? %b\n",
                composedA, decomposedA, composedA.equals(decomposedA)); // => Á logical eq Á, but alone A, and accent alone ́, but eq? false

        final String ligature = "\ufb03";
        final String plainLetters = "\u0066\u0066\u0069";
        System.out.printf("%s logical/semantically eq %s, but eq? %b\n",
                ligature, plainLetters, ligature.equals(plainLetters)); // => ﬃ logical/semantically eq ffi, but eq? false

        System.out.println(normalizedLength(composedA, Normalizer.Form.NFD)); // => 2
        System.out.println(normalizedLength(composedA, Normalizer.Form.NFC)); // => 1

        System.out.println(normalizedLength(decomposedA, Normalizer.Form.NFD)); // => 2
        System.out.println(normalizedLength(decomposedA, Normalizer.Form.NFC)); // => 1

        System.out.println(normalizedLength(ligature, Normalizer.Form.NFKD)); // => 3
        System.out.println(normalizedLength(ligature, Normalizer.Form.NFKC)); // => 3

        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFKD)); // => 3
        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFKC)); // => 3

        // The canonical forms leave the ligature as a single char.
        System.out.println(normalizedLength(ligature, Normalizer.Form.NFD)); // => 1
        System.out.println(normalizedLength(ligature, Normalizer.Form.NFC)); // => 1

        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFD)); // => 3
        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFC)); // => 3

        final String recomposedA = Normalizer.normalize(decomposedA, Normalizer.Form.NFC);
        System.out.println(composedA.equals(recomposedA)); // => true

        final String ligatureNfkc = Normalizer.normalize(ligature, Normalizer.Form.NFKC);
        final String lettersNfkc = Normalizer.normalize(plainLetters, Normalizer.Form.NFKC);
        System.out.println(ligatureNfkc.equals(lettersNfkc)); // => true
        System.out.println(ligatureNfkc); // => ffi
        System.out.println(Normalizer.normalize(ligature, Normalizer.Form.NFKD)); // => ffi
    }

    /** Decodes {@code bytes} into a string using {@code charset}. */
    private static String decode(byte[] bytes, java.nio.charset.Charset charset) {
        return new String(bytes, charset);
    }

    /** Returns the length in chars of {@code text} after normalizing it to {@code form}. */
    private static int normalizedLength(String text, Normalizer.Form form) {
        return Normalizer.normalize(text, form).length();
    }

    /**
     * Prints one row: the label left-aligned in 10 columns, the text left-aligned
     * in 12 columns, then the text's length in chars.
     *
     * @param charset label shown in the first column
     * @param text    decoded text to display and measure
     * @return the stream returned by {@code System.out.printf}
     */
    private static PrintStream print(String charset, String text) {
        final int length = text.length();
        return System.out.printf("%-10s: %-12s Length %d\n", charset, text, length);
    }
 }
diff --git a/world.go b/world.go
 package main

 import (
 	"fmt"
 	"unicode/utf8"
 )

 // main prints three views of the string "世界": its length in bytes, its
 // length in runes, and its raw UTF-8 bytes in hexadecimal.
 func main() {
 	const text = "世界"

 	byteLen := len(text)
 	runeCount := utf8.RuneCountInString(text)

 	fmt.Println(byteLen) // => 6
 	fmt.Println(runeCount) // => 2
 	// The space flag in "% X" puts a space between the hex bytes.
 	fmt.Printf("% X\n", text) // => E4 B8 96 E7 95 8C
 }
	General:
	http://www.joelonsoftware.com/articles/Unicode.html

	Unicode:
	http://www.unicode.org/standard/WhatIsUnicode.html
	http://www.unicode.org/history/unicode88.pdf
	http://unicode.org/charts/
	http://unicode.org/cldr/utility/character.jsp
	https://en.wikipedia.org/wiki/List_of_Unicode_characters

	UTF:
	http://www.utf-8.com/
	http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt
	http://www.ietf.org/rfc/rfc3629.txt
	http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
	http://unicode.org/faq/utf_bom.html

	Java:
	http://docs.oracle.com/javase/7/docs/technotes/guides/intl/overview.html
	http://www.oracle.com/technetwork/articles/java/supplementary-142654.html

	Go:
	https://blog.golang.org/strings
	https://github.com/paulrosania/go-charset

	Normalization:
	https://blog.golang.org/normalization
	https://docs.oracle.com/javase/8/docs/api/java/text/Normalizer.html
	http://en.wikipedia.org/wiki/Unicode_equivalence
	http://unicode.org/reports/tr15/
	package main

	// https://play.golang.org/p/XydB2AQ1dV

	import (
	"fmt"
	"unicode/utf8"
	"unicode/utf16"
	"bytes"
	)

	// main shows two encodings of the supplementary code point U+1030C: its
	// UTF-16 surrogate pair and its four-byte UTF-8 sequence. The "// =>"
	// comments record the output expected from each print statement.
	func main() {
		const codePoint rune = 0x1030c

		// Split the code point into its UTF-16 high and low surrogate halves.
		hi, lo := utf16.EncodeRune(codePoint)
		fmt.Printf(
			"%U %U %v %v %v\n",
			hi, lo,
			utf16.IsSurrogate(codePoint), utf16.IsSurrogate(hi), utf16.IsSurrogate(lo),
		) // => U+D800 U+DF0C false true true

		// Writing each surrogate half as a rune of its own does not reproduce the
		// character: per the expected output, each half comes out as U+FFFD.
		var out bytes.Buffer
		out.WriteRune(hi)
		out.WriteRune(lo)
		fmt.Printf("U+1030C is %s %X %X\n", out.String(), hi, lo) // => U+1030C is �� D800 DF0C

		// Encoding the whole code point as UTF-8 fills all four bytes.
		encoded := make([]byte, 4)
		utf8.EncodeRune(encoded, codePoint)
		out.Reset()
		out.Write(encoded)
		fmt.Printf("U+1030C is %s, raw bytes %X %v\n", out.String(), encoded, encoded) // => U+1030C is 𐌌, raw bytes F0908C8C [240 144 140 140]
	}
	package experiments;

	import java.io.PrintStream;
	import java.nio.charset.StandardCharsets;
	import java.text.Normalizer;

	/**
	 * Console walkthrough of Unicode handling in Java: code points, decoding the
	 * same bytes under different charsets, UTF-16 byte order marks, surrogate
	 * pairs and normalization forms.
	 *
	 * <p>The {@code // =>} comments record the output expected from each statement.
	 */
	public class Unicode {

	    /**
	     * Runs every demonstration in a fixed order, printing to standard output.
	     *
	     * @param args ignored
	     */
	    public static void main(String[] args) {
	        showCodePoints();
	        showUtf8BytesUnderDifferentCharsets();
	        showUtf16ByteOrder();
	        showEscapedLiterals();
	        showSurrogates();
	        showNormalization();
	    }

	    /** Prints the code points found at char index 0 and 1 of a two-character string. */
	    private static void showCodePoints() {
	        final String world = "世界";
	        System.out.printf("%X%n", Character.codePointAt(world, 0)); // => 4E16
	        System.out.printf("%X%n", Character.codePointAt(world, 1)); // => 754C
	    }

	    /** Decodes one UTF-8 byte sequence as UTF-8, US-ASCII and UTF-16 to compare the results. */
	    private static void showUtf8BytesUnderDifferentCharsets() {
	        // "Hello, " as single bytes, followed by the three-byte UTF-8 sequences of the two CJK characters.
	        final byte[] utf8Bytes = {
	                0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20,
	                (byte) 0xe4, (byte) 0xb8, (byte) 0x96, (byte) 0xe7, (byte) 0x95, (byte) 0x8c
	        };
	        System.out.printf("Byte array length: %d\n\n", utf8Bytes.length); // => Byte array length: 13

	        final String asUtf8 = decode(utf8Bytes, StandardCharsets.UTF_8);
	        print("UTF-8", asUtf8); // => UTF-8     : Hello, 世界    Length 9

	        final String asAscii = decode(utf8Bytes, StandardCharsets.US_ASCII);
	        print("ASCII", asAscii); // => ASCII     : Hello, ������ Length 13

	        final String asUtf16 = decode(utf8Bytes, StandardCharsets.UTF_16);
	        print("UTF-16", asUtf16); // => UTF-16    : 䡥汬漬⃤뢖�      Length 7
	    }

	    /** Decodes UTF-16 byte sequences without a BOM, with a big-endian BOM and with a little-endian BOM. */
	    private static void showUtf16ByteOrder() {
	        // No byte order mark; per the expected output, UTF_16 reads this as big-endian.
	        final byte[] withoutBom = {
	                0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
	                0x4e, 0x16, 0x75, 0x4c
	        };
	        print("UTF-16", decode(withoutBom, StandardCharsets.UTF_16)); // => UTF-16    : Hello, 世界    Length 9

	        // Same text preceded by the bytes FE FF.
	        final byte[] bigEndianWithBom = {
	                (byte) 0xfe, (byte) 0xff,
	                0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
	                0x4e, 0x16, 0x75, 0x4c
	        };
	        // UTF_16 consumes the mark (length 9); the fixed-endianness charsets keep it as a character (length 10).
	        print("UTF-16 BE1", decode(bigEndianWithBom, StandardCharsets.UTF_16)); // => UTF-16 BE1: Hello, 世界    Length 9
	        print("UTF-16 BE2", decode(bigEndianWithBom, StandardCharsets.UTF_16BE)); // => UTF-16 BE2: Hello, 世界   Length 10
	        print("UTF-16 LE1", decode(bigEndianWithBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE1: �䠀攀氀氀漀Ⰰ ᙎ䱵   Length 10

	        // Byte-swapped variant preceded by the bytes FF FE.
	        final byte[] littleEndianWithBom = {
	                (byte) 0xff, (byte) 0xfe,
	                0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00,
	                0x16, 0x4e, 0x4c, 0x75
	        };
	        print("UTF-16 LE2", decode(littleEndianWithBom, StandardCharsets.UTF_16)); // => UTF-16 LE2: Hello, 世界    Length 9
	        print("UTF-16 LE3", decode(littleEndianWithBom, StandardCharsets.UTF_16LE)); // => UTF-16 LE3: Hello, 世界   Length 10
	    }

	    /** Prints the same text written once with Unicode escapes and once with literal characters. */
	    private static void showEscapedLiterals() {
	        // every char is 2 bytes (16 bit)
	        final String escaped = "\u0048\u0065\u006c\u006c\u006f\u002c\u0020\u4e16\u754c";
	        System.out.printf("Hello, world is %s\n", escaped); // => Hello, world is Hello, 世界
	        System.out.printf("World is %s\n", "世界"); // => World is 世界
	    }

	    /** Prints supplementary characters built from surrogate pairs and the pair values for U+10302. */
	    private static void showSurrogates() {
	        final byte[] pairBytes = {(byte) 0xd8, 0x00, (byte) 0xdf, 0x0c};
	        System.out.println(decode(pairBytes, StandardCharsets.UTF_16));
	        System.out.println("\ud800\udf02");

	        final int supplementary = 0x10302;
	        final char high = Character.highSurrogate(supplementary);
	        final char low = Character.lowSurrogate(supplementary);
	        System.out.printf("%x %x\n", (int) high, (int) low); // => d800 df02
	    }

	    /** Compares composed and decomposed strings before and after applying each normalization form. */
	    private static void showNormalization() {
	        final String composedA = "\u00c1";
	        final String decomposedA = "\u0041\u0301";
	        System.out.printf("%s logical eq %s, but alone \u0041, and accent alone \u0301, but eq? %b\n",
	                composedA, decomposedA, composedA.equals(decomposedA)); // => Á logical eq Á, but alone A, and accent alone ́, but eq? false

	        final String ligature = "\ufb03";
	        final String plainLetters = "\u0066\u0066\u0069";
	        System.out.printf("%s logical/semantically eq %s, but eq? %b\n",
	                ligature, plainLetters, ligature.equals(plainLetters)); // => ﬃ logical/semantically eq ffi, but eq? false

	        System.out.println(normalizedLength(composedA, Normalizer.Form.NFD)); // => 2
	        System.out.println(normalizedLength(composedA, Normalizer.Form.NFC)); // => 1

	        System.out.println(normalizedLength(decomposedA, Normalizer.Form.NFD)); // => 2
	        System.out.println(normalizedLength(decomposedA, Normalizer.Form.NFC)); // => 1

	        System.out.println(normalizedLength(ligature, Normalizer.Form.NFKD)); // => 3
	        System.out.println(normalizedLength(ligature, Normalizer.Form.NFKC)); // => 3

	        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFKD)); // => 3
	        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFKC)); // => 3

	        // The canonical forms leave the ligature as a single char.
	        System.out.println(normalizedLength(ligature, Normalizer.Form.NFD)); // => 1
	        System.out.println(normalizedLength(ligature, Normalizer.Form.NFC)); // => 1

	        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFD)); // => 3
	        System.out.println(normalizedLength(plainLetters, Normalizer.Form.NFC)); // => 3

	        final String recomposedA = Normalizer.normalize(decomposedA, Normalizer.Form.NFC);
	        System.out.println(composedA.equals(recomposedA)); // => true

	        final String ligatureNfkc = Normalizer.normalize(ligature, Normalizer.Form.NFKC);
	        final String lettersNfkc = Normalizer.normalize(plainLetters, Normalizer.Form.NFKC);
	        System.out.println(ligatureNfkc.equals(lettersNfkc)); // => true
	        System.out.println(ligatureNfkc); // => ffi
	        System.out.println(Normalizer.normalize(ligature, Normalizer.Form.NFKD)); // => ffi
	    }

	    /** Decodes {@code bytes} into a string using {@code charset}. */
	    private static String decode(byte[] bytes, java.nio.charset.Charset charset) {
	        return new String(bytes, charset);
	    }

	    /** Returns the length in chars of {@code text} after normalizing it to {@code form}. */
	    private static int normalizedLength(String text, Normalizer.Form form) {
	        return Normalizer.normalize(text, form).length();
	    }

	    /**
	     * Prints one row: the label left-aligned in 10 columns, the text left-aligned
	     * in 12 columns, then the text's length in chars.
	     *
	     * @param charset label shown in the first column
	     * @param text    decoded text to display and measure
	     * @return the stream returned by {@code System.out.printf}
	     */
	    private static PrintStream print(String charset, String text) {
	        final int length = text.length();
	        return System.out.printf("%-10s: %-12s Length %d\n", charset, text, length);
	    }
	}