Created
September 9, 2023 22:19
-
-
Save alexanderankin/bd35d15965718524429cc8508580ff02 to your computer and use it in GitHub Desktop.
java utf8 ideas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package java_utf.utf8; | |
| import java.nio.charset.StandardCharsets; | |
| import java.util.*; | |
| import java.util.stream.IntStream; | |
| import java.util.stream.StreamSupport; | |
| public class Utf8Strings { | |
| public static void main(String[] args) { | |
| byte[] bytes = "abc".getBytes(StandardCharsets.UTF_8); | |
| Utf8String utf8String = new Utf8String(bytes); | |
| /* | |
| System.out.println(utf8String.length()); | |
| for (int i : utf8String.chars().toArray()) { | |
| System.out.println((char) i); | |
| } | |
| */ | |
| System.out.println(utf8String.equals( | |
| new Utf8String(Utf8String.encode(utf8String.decode())) | |
| )); | |
| for (String s : Arrays.asList("a", "", "abc", "📙")) { | |
| utf8String = new Utf8String(s.getBytes(StandardCharsets.UTF_8)); | |
| System.out.println(utf8String.equals( | |
| new Utf8String(Utf8String.encode(utf8String.decode())) | |
| )); | |
| utf8String = new Utf8String(Utf8String.encode(s.chars().toArray())); | |
| System.out.println(utf8String.equals( | |
| new Utf8String(Utf8String.encode(utf8String.decode())) | |
| )); | |
| } | |
| } | |
| @SuppressWarnings("unused") | |
| public interface Utf8CharSequence { | |
| static int compare(Utf8CharSequence cs1, Utf8CharSequence cs2) { | |
| return Utf8CharSequenceComparator.INSTANCE.compare(cs1, cs2); | |
| } | |
| // number of code points | |
| int length(); | |
| // codepoint | |
| int charAt(int index); | |
| // access byte array (why?) | |
| byte byteAt(int index); | |
| default boolean isEmpty() { | |
| return this.length() == 0; | |
| } | |
| Utf8CharSequence subSequence(int start, int end); | |
| Utf8String toUtf8String(); | |
| default IntStream chars() { | |
| return StreamSupport.intStream(() -> | |
| Spliterators.spliterator( | |
| new Utf8CsCharIterator(this), | |
| length(), | |
| Spliterator.ORDERED), | |
| Spliterator.SUBSIZED | Spliterator.SIZED | Spliterator.ORDERED, | |
| false); | |
| } | |
| } | |
| public static class Utf8CharSequenceComparator implements Comparator<Utf8CharSequence> { | |
| public static final Utf8CharSequenceComparator INSTANCE = new Utf8CharSequenceComparator(); | |
| @Override | |
| public int compare(Utf8CharSequence cs1, Utf8CharSequence cs2) { | |
| if (Objects.requireNonNull(cs1) == Objects.requireNonNull(cs2)) { | |
| return 0; | |
| } | |
| if (cs1.getClass() == cs2.getClass() && cs1 instanceof Comparable) { | |
| @SuppressWarnings("unchecked") | |
| Comparable<Object> comparable = (Comparable<Object>) cs1; | |
| return comparable.compareTo(cs2); | |
| } | |
| for (int i = 0, len = Math.min(cs1.length(), cs2.length()); i < len; i++) { | |
| int a = cs1.charAt(i); | |
| int b = cs2.charAt(i); | |
| if (a != b) { | |
| return a - b; | |
| } | |
| } | |
| return cs1.length() - cs2.length(); | |
| } | |
| } | |
| static class Utf8CsCharIterator implements PrimitiveIterator.OfInt { | |
| final Utf8CharSequence charSequence; | |
| int position = 0; | |
| Utf8CsCharIterator(Utf8CharSequence charSequence) { | |
| this.charSequence = charSequence; | |
| } | |
| @Override | |
| public int nextInt() { | |
| return charSequence.charAt(position++); | |
| } | |
| @Override | |
| public boolean hasNext() { | |
| return position < charSequence.length(); | |
| } | |
| } | |
| @SuppressWarnings("unused") | |
| static public class Utf8Character { | |
| public static final int MIN_HIGH_SURROGATE = '\uD800'; | |
| public static final int MAX_HIGH_SURROGATE = '\uDBFF'; | |
| public static final int MIN_LOW_SURROGATE = '\uDC00'; | |
| public static final int MAX_LOW_SURROGATE = '\uDFFF'; | |
| public static boolean isHighSurrogate(int ch) { | |
| return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1); | |
| } | |
| public static boolean isLowSurrogate(int ch) { | |
| return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1); | |
| } | |
| public static int toCodePoint(int c1, int c2) { | |
| return 0; | |
| } | |
| } | |
| static public class Utf8String implements Utf8CharSequence { | |
| final byte[] value; | |
| public Utf8String(byte[] value) { | |
| this.value = value; | |
| } | |
| static byte[] encode(int[] codepoints) { | |
| ByteArray byteArray = new ByteArray(codepoints.length); | |
| /* | |
| is casting to byte okay here? | |
| or is signed byte going to break things? | |
| https://gist.github.com/MightyPork/52eda3e5677b4b03524e40c9f0ab1da5 | |
| */ | |
| for (int utf : codepoints) { | |
| if (utf <= 0x7F) { | |
| // Plain ASCII | |
| byteArray.add((byte) utf); | |
| } else if (utf <= 0x07FF) { | |
| // 2-byte unicode | |
| byteArray.add((byte) (((utf >> 6) & 0x1F) | 0xC0)); | |
| byteArray.add((byte) (((utf) & 0x3F) | 0x80)); | |
| } else if (utf <= 0xFFFF) { | |
| // 3-byte unicode | |
| byteArray.add((byte) (((utf >> 12) & 0x0F) | 0xE0)); | |
| byteArray.add((byte) (((utf >> 6) & 0x3F) | 0x80)); | |
| byteArray.add((byte) (((utf) & 0x3F) | 0x80)); | |
| } else if (utf <= 0x10FFFF) { | |
| // 4-byte unicode | |
| byteArray.add((byte) (((utf >> 18) & 0x07) | 0xF0)); | |
| byteArray.add((byte) (((utf >> 12) & 0x3F) | 0x80)); | |
| byteArray.add((byte) (((utf >> 6) & 0x3F) | 0x80)); | |
| byteArray.add((byte) (((utf) & 0x3F) | 0x80)); | |
| } else { | |
| // error - use replacement character | |
| byteArray.add((byte) 0xEF); | |
| byteArray.add((byte) 0xBF); | |
| byteArray.add((byte) 0xBD); | |
| } | |
| } | |
| return byteArray.toArray(); | |
| } | |
| static int[] decode(byte[] value) { | |
| // https://news.ycombinator.com/item?id=15425046 | |
| // uint32_t v = *(uint32_t*)s; | |
| // *c = (uint32_t)(v & masks[len]) << 18; | |
| // *c |= (uint32_t)((v>>8) & 0x3f) << 12; | |
| // *c |= (uint32_t)((v>>16) & 0x3f) << 6; | |
| // *c |= (uint32_t)((v>>24) & 0x3f) << 0; | |
| // *c >>= shiftc[len]; | |
| // https://nullprogram.com/blog/2017/10/06/ | |
| // https://gist.github.com/s4y/7c95f1ebeb2c069cfb09db3c3251eca3 | |
| /* | |
| unsigned char buf[1024 * 1024]; | |
| size_t count = 0; | |
| size_t avail; | |
| uint32_t codepoint_hash = 0; | |
| utf8_decode_context_t state = { UTF8_OK, 0 }; | |
| while ((avail = read(STDIN_FILENO, buf, sizeof(buf) / sizeof(*buf))) > 0) { | |
| for (size_t i = 0; i < avail; i++) { | |
| utf8_decode(&state, buf[i]); | |
| switch (state.state) { | |
| case UTF8_OK: | |
| count += 1; | |
| codepoint_hash ^= state.codepoint; | |
| break; | |
| case UTF8_ERROR: | |
| return 1; | |
| } | |
| } | |
| } | |
| if (state.state != UTF8_OK) { | |
| return 2; | |
| } | |
| printf("Decoded %zu code points and got a hash of %u.\n", count, codepoint_hash); | |
| */ | |
| int[] codepoints = new int[value.length]; | |
| int i = 0; | |
| int UTF8_ERROR = 4; | |
| int state = 0; | |
| for (int b : value) { | |
| switch (state) { | |
| case 0 -> { | |
| if (b < 0x80) { | |
| codepoints[i] = b; | |
| } else if (b < 0xc0) { | |
| state = UTF8_ERROR; | |
| } else if (b < 0xe0) { | |
| state = 1; | |
| codepoints[i] = b & 0x1f; | |
| } else if (b < 0xf0) { | |
| state = 2; | |
| codepoints[i] = b & 0xf; | |
| } else if (b < 0xf8) { | |
| state = 3; | |
| codepoints[i] = b & 0x7; | |
| } else { | |
| state = UTF8_ERROR; | |
| } | |
| } | |
| case 1, 2, 3 -> { | |
| if (b >= 0x80 && b <= 0xbf) { | |
| state -= 1; | |
| codepoints[i] = | |
| (codepoints[i] << 6) | (b & 0x3f); | |
| } else { | |
| state = UTF8_ERROR; | |
| } | |
| } | |
| } | |
| switch (state) { | |
| case 0 -> i++; | |
| case 4 -> throw new IllegalArgumentException("bad utf8 bytes"); | |
| } | |
| } | |
| return codepoints; | |
| } | |
| byte[] encode() { | |
| return value; | |
| } | |
| int[] decode() { | |
| return decode(value); | |
| } | |
| @Override | |
| public int length() { | |
| return decode().length; | |
| } | |
| @Override | |
| public int charAt(int index) { | |
| return decode()[index]; | |
| } | |
| @Override | |
| public byte byteAt(int index) { | |
| return value[index]; | |
| } | |
| @Override | |
| public Utf8String subSequence(int start, int end) { | |
| return new Utf8String(Arrays.copyOfRange(value, start, end)); | |
| } | |
| @Override | |
| public Utf8String toUtf8String() { | |
| return this; | |
| } | |
| @Override | |
| public int hashCode() { | |
| return Arrays.hashCode(value); | |
| } | |
| @Override | |
| public boolean equals(Object obj) { | |
| if (obj == this) return true; | |
| if (!(obj instanceof Utf8String other)) return false; | |
| if (!other.canEqual(this)) return false; | |
| return Arrays.equals(value, other.value); | |
| } | |
| // https://projectlombok.org/features/EqualsAndHashCode | |
| protected boolean canEqual(Object other) { | |
| return other instanceof Utf8String; | |
| } | |
| @Override | |
| public String toString() { | |
| return new String(value, StandardCharsets.UTF_8); | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment