Skip to content

Instantly share code, notes, and snippets.

@alexanderankin
Created September 9, 2023 22:19
Show Gist options
  • Select an option

  • Save alexanderankin/bd35d15965718524429cc8508580ff02 to your computer and use it in GitHub Desktop.

Select an option

Save alexanderankin/bd35d15965718524429cc8508580ff02 to your computer and use it in GitHub Desktop.
java utf8 ideas
package java_utf.utf8;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.IntStream;
import java.util.stream.StreamSupport;
public class Utf8Strings {
/**
 * Small round-trip demo: for a handful of sample strings, checks that
 * decoding a {@link Utf8String} to code points and encoding those code
 * points again yields an equal {@link Utf8String}, printing the result of
 * each comparison.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    byte[] bytes = "abc".getBytes(StandardCharsets.UTF_8);
    Utf8String utf8String = new Utf8String(bytes);
    System.out.println(utf8String.equals(
            new Utf8String(Utf8String.encode(utf8String.decode()))
    ));

    for (String s : List.of("a", "", "abc", "📙")) {
        // Start from the bytes the JDK produces for this string.
        utf8String = new Utf8String(s.getBytes(StandardCharsets.UTF_8));
        System.out.println(utf8String.equals(
                new Utf8String(Utf8String.encode(utf8String.decode()))
        ));

        // Start from our own encoder. encode() takes Unicode code points, so
        // use codePoints() rather than chars(): chars() yields UTF-16 code
        // units and would hand the encoder two surrogates for "📙".
        utf8String = new Utf8String(Utf8String.encode(s.codePoints().toArray()));
        System.out.println(utf8String.equals(
                new Utf8String(Utf8String.encode(utf8String.decode()))
        ));
    }
}
/**
 * A readable sequence of Unicode code points that also exposes its
 * underlying bytes. Indices passed to {@link #charAt(int)} and the value
 * returned by {@link #length()} are expressed in code points.
 */
@SuppressWarnings("unused")
public interface Utf8CharSequence {
    /**
     * Orders two sequences by delegating to the shared
     * {@code Utf8CharSequenceComparator.INSTANCE}.
     *
     * @param cs1 the first sequence
     * @param cs2 the second sequence
     * @return whatever the shared comparator returns for the pair
     */
    static int compare(Utf8CharSequence cs1, Utf8CharSequence cs2) {
        final Comparator<Utf8CharSequence> ordering = Utf8CharSequenceComparator.INSTANCE;
        return ordering.compare(cs1, cs2);
    }

    /** @return the number of code points in this sequence */
    int length();

    /**
     * @param index position of the wanted code point
     * @return the code point at {@code index}
     */
    int charAt(int index);

    /**
     * Gives access to the backing bytes.
     *
     * @param index position of the wanted byte
     * @return the byte at {@code index}
     */
    byte byteAt(int index);

    /** @return {@code true} when {@link #length()} reports zero */
    default boolean isEmpty() {
        final int size = this.length();
        return size == 0;
    }

    /**
     * @param start start index of the wanted slice
     * @param end   end index of the wanted slice
     * @return the requested slice of this sequence
     */
    Utf8CharSequence subSequence(int start, int end);

    /** @return this sequence as a {@code Utf8String} */
    Utf8String toUtf8String();

    /**
     * Streams the values returned by {@link #charAt(int)} for positions
     * {@code 0 .. length() - 1}, in order. The backing spliterator is only
     * created (and {@link #length()} only queried) when the stream's
     * terminal operation starts.
     *
     * @return a sequential, ordered, sized stream of this sequence's elements
     */
    default IntStream chars() {
        final int streamTraits = Spliterator.ORDERED | Spliterator.SIZED | Spliterator.SUBSIZED;
        return StreamSupport.intStream(
                () -> {
                    PrimitiveIterator.OfInt cursor = new Utf8CsCharIterator(this);
                    return Spliterators.spliterator(cursor, length(), Spliterator.ORDERED);
                },
                streamTraits,
                false);
    }
}
/**
 * Orders {@link Utf8CharSequence}s element by element using
 * {@code charAt}, with the shorter sequence first when one is a prefix of
 * the other.
 */
public static class Utf8CharSequenceComparator implements Comparator<Utf8CharSequence> {
    /** Shared instance; the comparator holds no state. */
    public static final Utf8CharSequenceComparator INSTANCE = new Utf8CharSequenceComparator();

    /**
     * Compares two sequences.
     *
     * @param cs1 the first sequence, must not be {@code null}
     * @param cs2 the second sequence, must not be {@code null}
     * @return a negative value, zero, or a positive value as {@code cs1}
     *         sorts before, equal to, or after {@code cs2}
     * @throws NullPointerException if either argument is {@code null}
     */
    @Override
    public int compare(Utf8CharSequence cs1, Utf8CharSequence cs2) {
        Objects.requireNonNull(cs1, "cs1");
        Objects.requireNonNull(cs2, "cs2");
        if (cs1 == cs2) {
            return 0;
        }

        // Two instances of the same class that define their own natural
        // ordering are compared with that ordering.
        if (cs1.getClass() == cs2.getClass() && cs1 instanceof Comparable) {
            @SuppressWarnings("unchecked")
            Comparable<Object> comparable = (Comparable<Object>) cs1;
            return comparable.compareTo(cs2);
        }

        // Read each length once; implementations may have to scan to compute it.
        int length1 = cs1.length();
        int length2 = cs2.length();
        for (int i = 0, len = Math.min(length1, length2); i < len; i++) {
            // Integer.compare instead of subtraction: charAt returns an int,
            // and a difference of two arbitrary ints can overflow and flip sign.
            int order = Integer.compare(cs1.charAt(i), cs2.charAt(i));
            if (order != 0) {
                return order;
            }
        }
        return Integer.compare(length1, length2);
    }
}
/**
 * Iterates over a {@link Utf8CharSequence} by calling {@code charAt} for
 * positions {@code 0, 1, 2, ...} until {@code length()} is reached.
 */
static class Utf8CsCharIterator implements PrimitiveIterator.OfInt {
    final Utf8CharSequence charSequence;
    // Index of the element the next call to nextInt() will return.
    int position = 0;

    /**
     * @param charSequence the sequence to iterate over, must not be {@code null}
     * @throws NullPointerException if {@code charSequence} is {@code null}
     */
    Utf8CsCharIterator(Utf8CharSequence charSequence) {
        this.charSequence = Objects.requireNonNull(charSequence, "charSequence");
    }

    /**
     * @return the element at the current position, then advances
     * @throws NoSuchElementException if the sequence is exhausted
     */
    @Override
    public int nextInt() {
        // The Iterator contract requires NoSuchElementException when exhausted,
        // rather than whatever charAt happens to throw for a bad index.
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return charSequence.charAt(position++);
    }

    /** @return {@code true} while the position is below the sequence's current length */
    @Override
    public boolean hasNext() {
        return position < charSequence.length();
    }
}
/**
 * Helpers for recognising UTF-16 surrogate code units and combining a
 * surrogate pair into a single code point.
 */
@SuppressWarnings("unused")
static public class Utf8Character {
    public static final int MIN_HIGH_SURROGATE = '\uD800';
    public static final int MAX_HIGH_SURROGATE = '\uDBFF';
    public static final int MIN_LOW_SURROGATE = '\uDC00';
    public static final int MAX_LOW_SURROGATE = '\uDFFF';

    /**
     * @param ch the value to test
     * @return {@code true} if {@code ch} lies in
     *         {@code MIN_HIGH_SURROGATE..MAX_HIGH_SURROGATE} (inclusive)
     */
    public static boolean isHighSurrogate(int ch) {
        return ch >= MIN_HIGH_SURROGATE && ch < (MAX_HIGH_SURROGATE + 1);
    }

    /**
     * @param ch the value to test
     * @return {@code true} if {@code ch} lies in
     *         {@code MIN_LOW_SURROGATE..MAX_LOW_SURROGATE} (inclusive)
     */
    public static boolean isLowSurrogate(int ch) {
        return ch >= MIN_LOW_SURROGATE && ch < (MAX_LOW_SURROGATE + 1);
    }

    /**
     * Combines a surrogate pair into the supplementary code point it
     * represents. The arguments are not validated; the result is only
     * meaningful when {@code c1} is a high surrogate and {@code c2} is a
     * low surrogate.
     *
     * @param c1 the high (leading) surrogate
     * @param c2 the low (trailing) surrogate
     * @return the code point in the range {@code 0x10000..0x10FFFF}
     */
    public static int toCodePoint(int c1, int c2) {
        // Each surrogate carries 10 payload bits; the pair encodes the
        // code point's offset above the first supplementary code point.
        return 0x10000
                + ((c1 - MIN_HIGH_SURROGATE) << 10)
                + (c2 - MIN_LOW_SURROGATE);
    }
}
/**
 * A string held as its UTF-8 encoded bytes. {@link #length()},
 * {@link #charAt(int)} and {@link #subSequence(int, int)} work in code
 * points; {@link #byteAt(int)} works on the raw bytes.
 *
 * <p>The constructor stores the given array without copying it and
 * {@link #encode()} returns that same array, so changes made to the array
 * by the caller are visible through this object.
 */
static public class Utf8String implements Utf8CharSequence {
    final byte[] value;

    /**
     * @param value the UTF-8 bytes backing this string; stored as-is, not copied
     */
    public Utf8String(byte[] value) {
        this.value = value;
    }

    /**
     * Encodes code points as UTF-8.
     *
     * <p>Values that are not Unicode scalar values (negative, above
     * {@code 0x10FFFF}, or in the surrogate range {@code 0xD800..0xDFFF})
     * are written as U+FFFD, the replacement character.
     *
     * <p>Bit layout reference:
     * https://gist.github.com/MightyPork/52eda3e5677b4b03524e40c9f0ab1da5
     *
     * @param codepoints the code points to encode
     * @return the UTF-8 bytes
     */
    static byte[] encode(int[] codepoints) {
        ByteArray byteArray = new ByteArray(codepoints.length);
        // The (byte) casts below are safe even though byte is signed: the
        // narrowing conversion keeps exactly the low 8 bits we computed.
        for (int utf : codepoints) {
            boolean surrogate = utf >= 0xD800 && utf <= 0xDFFF;
            if (utf < 0 || utf > 0x10FFFF || surrogate) {
                // error - use replacement character (U+FFFD)
                byteArray.add((byte) 0xEF);
                byteArray.add((byte) 0xBF);
                byteArray.add((byte) 0xBD);
            } else if (utf <= 0x7F) {
                // Plain ASCII
                byteArray.add((byte) utf);
            } else if (utf <= 0x07FF) {
                // 2-byte unicode
                byteArray.add((byte) (((utf >> 6) & 0x1F) | 0xC0));
                byteArray.add((byte) (((utf) & 0x3F) | 0x80));
            } else if (utf <= 0xFFFF) {
                // 3-byte unicode
                byteArray.add((byte) (((utf >> 12) & 0x0F) | 0xE0));
                byteArray.add((byte) (((utf >> 6) & 0x3F) | 0x80));
                byteArray.add((byte) (((utf) & 0x3F) | 0x80));
            } else {
                // 4-byte unicode
                byteArray.add((byte) (((utf >> 18) & 0x07) | 0xF0));
                byteArray.add((byte) (((utf >> 12) & 0x3F) | 0x80));
                byteArray.add((byte) (((utf >> 6) & 0x3F) | 0x80));
                byteArray.add((byte) (((utf) & 0x3F) | 0x80));
            }
        }
        return byteArray.toArray();
    }

    /**
     * Decodes UTF-8 bytes into code points, one array element per code point.
     *
     * <p>Background reading on UTF-8 decoder design:
     * https://news.ycombinator.com/item?id=15425046,
     * https://nullprogram.com/blog/2017/10/06/,
     * https://gist.github.com/s4y/7c95f1ebeb2c069cfb09db3c3251eca3
     *
     * @param value the bytes to decode
     * @return the decoded code points; the array length is the number of
     *         code points, not the number of bytes
     * @throws IllegalArgumentException if the input is not well-formed
     *         UTF-8: a stray or missing continuation byte, an invalid lead
     *         byte, a sequence cut off at the end of the input, an overlong
     *         encoding, an encoded surrogate, or a value above
     *         {@code 0x10FFFF}
     */
    static int[] decode(byte[] value) {
        // At most one code point per byte; trimmed to the real count on return.
        int[] codepoints = new int[value.length];
        int count = 0;
        int pending = 0;  // continuation bytes still expected for the current code point
        int current = 0;  // bits of the code point assembled so far
        int minimum = 0;  // smallest value the current sequence length may legally encode

        for (byte raw : value) {
            // Java bytes are signed; mask so 0x80..0xFF compare as unsigned.
            int b = raw & 0xFF;
            if (pending == 0) {
                if (b < 0x80) {
                    codepoints[count++] = b;
                } else if (b < 0xC0) {
                    // continuation byte with no lead byte before it
                    throw new IllegalArgumentException("bad utf8 bytes");
                } else if (b < 0xE0) {
                    pending = 1;
                    current = b & 0x1F;
                    minimum = 0x80;
                } else if (b < 0xF0) {
                    pending = 2;
                    current = b & 0x0F;
                    minimum = 0x800;
                } else if (b < 0xF8) {
                    pending = 3;
                    current = b & 0x07;
                    minimum = 0x10000;
                } else {
                    throw new IllegalArgumentException("bad utf8 bytes");
                }
            } else {
                if ((b & 0xC0) != 0x80) {
                    throw new IllegalArgumentException("bad utf8 bytes");
                }
                current = (current << 6) | (b & 0x3F);
                pending--;
                if (pending == 0) {
                    boolean surrogate = current >= 0xD800 && current <= 0xDFFF;
                    if (current < minimum || current > 0x10FFFF || surrogate) {
                        throw new IllegalArgumentException("bad utf8 bytes");
                    }
                    codepoints[count++] = current;
                }
            }
        }
        if (pending != 0) {
            // input ended in the middle of a multi-byte sequence
            throw new IllegalArgumentException("bad utf8 bytes");
        }
        return Arrays.copyOf(codepoints, count);
    }

    /** @return the backing byte array itself (not a copy) */
    byte[] encode() {
        return value;
    }

    /**
     * @return the code points of this string
     * @throws IllegalArgumentException if the backing bytes are not well-formed UTF-8
     */
    int[] decode() {
        return decode(value);
    }

    /**
     * Decodes the whole byte array on every call.
     *
     * @return the number of code points
     * @throws IllegalArgumentException if the backing bytes are not well-formed UTF-8
     */
    @Override
    public int length() {
        return decode().length;
    }

    /**
     * Decodes the whole byte array on every call.
     *
     * @param index code point index
     * @return the code point at {@code index}
     * @throws IllegalArgumentException if the backing bytes are not well-formed UTF-8
     */
    @Override
    public int charAt(int index) {
        return decode()[index];
    }

    /**
     * @param index byte index into the backing array
     * @return the raw byte at {@code index}
     */
    @Override
    public byte byteAt(int index) {
        return value[index];
    }

    /**
     * Returns the code points from {@code start} (inclusive) to
     * {@code end} (exclusive) as a new string. Indices are code point
     * indices, matching {@link #length()} and {@link #charAt(int)}, so a
     * multi-byte sequence is never cut in half.
     *
     * @param start index of the first code point to include
     * @param end   index after the last code point to include
     * @return a new string holding the selected code points
     * @throws IndexOutOfBoundsException if {@code start < 0},
     *         {@code start > end}, or {@code end > length()}
     * @throws IllegalArgumentException if the backing bytes are not well-formed UTF-8
     */
    @Override
    public Utf8String subSequence(int start, int end) {
        int[] all = decode();
        Objects.checkFromToIndex(start, end, all.length);
        return new Utf8String(encode(Arrays.copyOfRange(all, start, end)));
    }

    @Override
    public Utf8String toUtf8String() {
        return this;
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(value);
    }

    /**
     * Two strings are equal when their backing bytes are equal and the
     * other object agrees via {@link #canEqual(Object)}.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) return true;
        if (!(obj instanceof Utf8String other)) return false;
        if (!other.canEqual(this)) return false;
        return Arrays.equals(value, other.value);
    }

    /**
     * Hook that lets a subclass refuse equality with instances of other
     * types; see https://projectlombok.org/features/EqualsAndHashCode
     */
    protected boolean canEqual(Object other) {
        return other instanceof Utf8String;
    }

    /** @return the backing bytes interpreted as UTF-8 by the JDK decoder */
    @Override
    public String toString() {
        return new String(value, StandardCharsets.UTF_8);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment